| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -check-prefixes=GCN |
| ; RUN: llc < %s -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefixes=GFX7 |
| ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX8 |
| ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX9 |
| ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10 |
| |
| ; We only have storage-only BF16 support. We can load/store those values as we treat them as u16, and |
| ; this file also checks fptrunc/fpext between bfloat and f32/f64 and bfloat function arguments. We |
| ; don't support other operations on them; codegen is expected to fail for those. |
| |
| ; Round-trips one bfloat through memory - load from %in, store to %out. |
| ; The expected output of every run is a 16-bit load followed by a 16-bit |
| ; store, with no conversion instruction in between. |
| define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_load_store: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_load_store: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_load_store: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_load_ushort v0, v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: flat_store_short v[2:3], v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_load_store: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_load_ushort v0, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_store_short v[2:3], v0, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_load_store: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_load_ushort v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_store_short v[2:3], v0, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %val = load bfloat, ptr addrspace(1) %in |
| store bfloat %val, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Loads a float, narrows it to bfloat with fptrunc and stores the result. |
| ; The GCN, GFX7 and GFX8 runs expect the loaded dword to be shifted right by |
| ; 16 before a 16-bit store; the GFX9 and GFX10 runs expect |
| ; global_store_short_d16_hi on the loaded dword instead. |
| ; NOTE(review) - no rounding instruction appears in any expected sequence, so |
| ; this looks like a plain truncation of the f32 bits; confirm that is intended. |
| define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_load_store_f32_to_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_load_store_f32_to_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_load_store_f32_to_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_load_dword v0, v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: flat_store_short v[2:3], v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_load_store_f32_to_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_load_store_f32_to_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %val = load float, ptr addrspace(1) %in |
| %val.bf16 = fptrunc float %val to bfloat |
| store bfloat %val.bf16, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Loads a double, narrows it to bfloat with fptrunc and stores the result. |
| ; Every run expects the value to be narrowed with v_cvt_f32_f64 first. After |
| ; that the GCN, GFX7 and GFX8 runs expect a right shift by 16 and a 16-bit |
| ; store, while the GFX9 and GFX10 runs expect global_store_short_d16_hi. |
| define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_load_store_f64_to_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_load_store_f64_to_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_load_store_f64_to_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: flat_store_short v[2:3], v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_load_store_f64_to_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] |
| ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_load_store_f64_to_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] |
| ; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %val = load double, ptr addrspace(1) %in |
| %val.bf16 = fptrunc double %val to bfloat |
| store bfloat %val.bf16, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Loads a bfloat, widens it to float with fpext and stores the float. |
| ; The GCN, GFX7 and GFX8 runs expect a 16-bit load followed by a left shift |
| ; by 16. The GFX9 and GFX10 runs expect v4 to be zeroed and then filled by |
| ; global_load_short_d16_hi, with no separate shift. |
| define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_load_store_bf16_to_f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_load_store_bf16_to_f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_load_store_bf16_to_f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_load_ushort v0, v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: flat_store_dword v[2:3], v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_load_store_bf16_to_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX9-NEXT: global_load_short_d16_hi v4, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_store_dword v[2:3], v4, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_load_store_bf16_to_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_store_dword v[2:3], v4, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %val = load bfloat, ptr addrspace(1) %in |
| %val.f32 = fpext bfloat %val to float |
| store float %val.f32, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Loads a bfloat, widens it to double with fpext and stores the double. |
| ; The GCN, GFX7 and GFX8 runs expect a 16-bit load and a left shift by 16; |
| ; the GFX9 and GFX10 runs expect a zeroed v4 filled by |
| ; global_load_short_d16_hi. Every run then expects v_cvt_f64_f32 and a |
| ; dwordx2 store. |
| define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_load_store_bf16_to_f64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 |
| ; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_load_store_bf16_to_f64: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 |
| ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_load_store_bf16_to_f64: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_load_ushort v0, v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 |
| ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_load_store_bf16_to_f64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX9-NEXT: global_load_short_d16_hi v4, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 |
| ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_load_store_bf16_to_f64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 |
| ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %val = load bfloat, ptr addrspace(1) %in |
| %val.f64 = fpext bfloat %val to double |
| store double %val.f64, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Round-trips a <2 x bfloat> vector through memory. |
| ; Every run expects a single dword load and a single dword store, with no |
| ; per-element instructions in between. |
| define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_load_store_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_load_store_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_load_store_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_load_dword v0, v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: flat_store_dword v[2:3], v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_load_store_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_store_dword v[2:3], v0, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_load_store_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_store_dword v[2:3], v0, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %val = load <2 x bfloat>, ptr addrspace(1) %in |
| store <2 x bfloat> %val, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Round-trips a <4 x bfloat> vector through memory. |
| ; Every run expects a single dwordx2 load and a single dwordx2 store. |
| define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_load_store_v4bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_load_store_v4bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_load_store_v4bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_load_store_v4bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_load_store_v4bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %val = load <4 x bfloat>, ptr addrspace(1) %in |
| store <4 x bfloat> %val, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Round-trips an <8 x bfloat> vector through memory. |
| ; Every run expects a single dwordx4 load and a single dwordx4 store. |
| define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_load_store_v8bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_load_store_v8bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_load_store_v8bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_load_store_v8bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_load_store_v8bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %val = load <8 x bfloat>, ptr addrspace(1) %in |
| store <8 x bfloat> %val, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Round-trips a <16 x bfloat> vector through memory. |
| ; Every run expects the 32 bytes to be split into two dwordx4 loads and two |
| ; dwordx4 stores, one at the base address and one 16 bytes past it. The GFX8 |
| ; run expects the +16 addresses to be formed with v_add_u32/v_addc_u32; the |
| ; other runs expect an offset of 16 on the memory instruction itself. |
| define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_load_store_v16bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 |
| ; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(1) |
| ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 |
| ; GCN-NEXT: s_waitcnt vmcnt(1) |
| ; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_load_store_v16bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 |
| ; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(1) |
| ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 |
| ; GFX7-NEXT: s_waitcnt vmcnt(1) |
| ; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_load_store_v16bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 |
| ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc |
| ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] |
| ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc |
| ; GFX8-NEXT: s_waitcnt vmcnt(1) |
| ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] |
| ; GFX8-NEXT: s_waitcnt vmcnt(1) |
| ; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[8:11] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_load_store_v16bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 |
| ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(1) |
| ; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 |
| ; GFX9-NEXT: s_waitcnt vmcnt(1) |
| ; GFX9-NEXT: global_store_dwordx4 v[2:3], v[8:11], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_load_store_v16bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 |
| ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(1) |
| ; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %val = load <16 x bfloat>, ptr addrspace(1) %in |
| store <16 x bfloat> %val, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Stores a bfloat that arrives as a function argument rather than from memory. |
| ; The GCN, GFX7 and GFX8 runs expect v0 to be shifted right by 16 before a |
| ; 16-bit store to the address in v[1:2]; the GFX9 and GFX10 runs expect |
| ; global_store_short_d16_hi on v0 with no separate shift. |
| define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_arg_store: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_arg_store: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_arg_store: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: flat_store_short v[1:2], v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_arg_store: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_store_short_d16_hi v[1:2], v0, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_arg_store: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_store_short_d16_hi v[1:2], v0, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| store bfloat %in, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Stores a <2 x bfloat> that arrives as a function argument. |
| ; The GCN and GFX7 runs expect the value in v0 and v1, combined into one |
| ; dword with a right shift of v1 and v_alignbit_b32, then stored to the |
| ; address in v[2:3]. The GFX8, GFX9 and GFX10 runs expect v0 to be stored |
| ; directly as a dword to the address in v[1:2]. |
| define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_arg_store_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_arg_store_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_arg_store_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_store_dword v[1:2], v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_arg_store_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_store_dword v[1:2], v0, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_arg_store_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_store_dword v[1:2], v0, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| store <2 x bfloat> %in, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Stores a <3 x bfloat> that arrives as a function argument. |
| ; Every run expects the 6 bytes to be written as one dword store at the base |
| ; address plus one 16-bit store at offset 4. The GCN and GFX7 runs expect the |
| ; elements to be packed first with right shifts and v_alignbit_b32; the GFX8, |
| ; GFX9 and GFX10 runs expect v0 and v1 to be stored as they arrive. |
| ; The destination uses the opaque pointer type, like the rest of this file. |
| define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_arg_store_v3bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 |
| ; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_arg_store_v3bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4 |
| ; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_arg_store_v3bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_store_dword v[2:3], v0 |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2 |
| ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
| ; GFX8-NEXT: flat_store_short v[2:3], v1 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_arg_store_v3bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4 |
| ; GFX9-NEXT: global_store_dword v[2:3], v0, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_arg_store_v3bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4 |
| ; GFX10-NEXT: global_store_dword v[2:3], v0, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| store <3 x bfloat> %in, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Stores a <4 x bfloat> that arrives as a function argument. |
| ; The GCN and GFX7 runs expect the value in v0 through v3, packed into two |
| ; dwords with right shifts and v_alignbit_b32, then written with one dwordx2 |
| ; store to the address in v[4:5]. The GFX8, GFX9 and GFX10 runs expect v[0:1] |
| ; to be stored directly to the address in v[2:3]. |
| define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_arg_store_v4bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 |
| ; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_arg_store_v4bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 |
| ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_arg_store_v4bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_arg_store_v4bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_arg_store_v4bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| store <4 x bfloat> %in, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Stores an <8 x bfloat> argument to an addrspace(1) pointer. |
| ; Per the checks below, the default-cpu and hawaii runs pack eight argument VGPRs into |
| ; four dwords (shift right by 16, then v_alignbit_b32) and issue one dwordx4 store; |
| ; tonga, gfx900 and gfx1010 store v[0:3] directly with a single dwordx4 store. |
| define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_arg_store_v8bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v5 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 |
| ; GCN-NEXT: v_alignbit_b32 v4, v10, v4, 16 |
| ; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 |
| ; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 |
| ; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[8:9], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_arg_store_v8bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 |
| ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 |
| ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 |
| ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_arg_store_v8bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_arg_store_v8bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_arg_store_v8bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| store <8 x bfloat> %in, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Stores a <16 x bfloat> argument to an addrspace(1) pointer. |
| ; Per the checks below, every run splits the store into two dwordx4 stores (byte |
| ; offsets 0 and 16). The default-cpu and hawaii runs first pack sixteen argument VGPRs |
| ; into eight dwords; the tonga run computes the +16 address with a 64-bit VGPR add |
| ; instead of using an instruction offset; gfx900 and gfx1010 use offset:16. |
| define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_arg_store_v16bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v5 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v13 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 |
| ; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 |
| ; GCN-NEXT: v_alignbit_b32 v4, v18, v4, 16 |
| ; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 |
| ; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 |
| ; GCN-NEXT: v_alignbit_b32 v12, v19, v12, 16 |
| ; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 |
| ; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 |
| ; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16 |
| ; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[16:17], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_arg_store_v16bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 |
| ; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 |
| ; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15 |
| ; GFX7-NEXT: v_alignbit_b32 v14, v0, v14, 16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v13 |
| ; GFX7-NEXT: v_alignbit_b32 v13, v0, v12, 16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11 |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: v_alignbit_b32 v12, v0, v10, 16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: v_alignbit_b32 v11, v0, v8, 16 |
| ; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16 |
| ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_arg_store_v16bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] |
| ; GFX8-NEXT: s_nop 0 |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8 |
| ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc |
| ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_arg_store_v16bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 |
| ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_arg_store_v16bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 |
| ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| store <16 x bfloat> %in, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Stores an `inreg` bfloat argument to an addrspace(1) pointer under the amdgpu_gfx |
| ; calling convention. Per the checks below, the value is read from s4. The default-cpu, |
| ; hawaii and tonga runs shift it right by 16 in an SGPR, copy it to a VGPR and store a |
| ; short; gfx900 and gfx1010 copy s4 to a VGPR and use global_store_short_d16_hi. |
| define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_inreg_arg_store: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_lshr_b32 s34, s4, 16 |
| ; GCN-NEXT: s_mov_b32 s38, 0 |
| ; GCN-NEXT: s_mov_b32 s39, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s36, s38 |
| ; GCN-NEXT: s_mov_b32 s37, s38 |
| ; GCN-NEXT: v_mov_b32_e32 v2, s34 |
| ; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_inreg_arg_store: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_lshr_b32 s34, s4, 16 |
| ; GFX7-NEXT: s_mov_b32 s38, 0 |
| ; GFX7-NEXT: s_mov_b32 s39, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s36, s38 |
| ; GFX7-NEXT: s_mov_b32 s37, s38 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s34 |
| ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_inreg_arg_store: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_lshr_b32 s34, s4, 16 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s34 |
| ; GFX8-NEXT: flat_store_short v[0:1], v2 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_inreg_arg_store: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_inreg_arg_store: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX10-NEXT: global_store_short_d16_hi v[0:1], v2, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| store bfloat %in, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Stores %val into the byval(bfloat) addrspace(5) slot, reloads it and returns it. |
| ; Per the checks below, every run emits only the 16-bit store (addressed via s32) and |
| ; the return; no load instruction is expected for the reload. |
| define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { |
| ; GCN-LABEL: test_byval: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 |
| ; GCN-NEXT: buffer_store_short v1, off, s[0:3], s32 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_byval: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 |
| ; GFX7-NEXT: buffer_store_short v1, off, s[0:3], s32 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_byval: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 |
| ; GFX8-NEXT: buffer_store_short v1, off, s[0:3], s32 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_byval: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_byval: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| store bfloat %val, ptr addrspace(5) %bv |
| %retval = load bfloat, ptr addrspace(5) %bv |
| ret bfloat %retval |
| } |
| |
| ; Stores %val through the sret(bfloat) addrspace(5) pointer. |
| ; Per the checks below, the pointer is used from v0 as an `offen` address and the value |
| ; comes from v1. The default-cpu, hawaii and tonga runs shift v1 right by 16 before a |
| ; short store; gfx900 and gfx1010 use buffer_store_short_d16_hi on v1 directly. |
| define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { |
| ; GCN-LABEL: test_sret: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_sret: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_sret: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_sret: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_sret: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| store bfloat %val, ptr addrspace(5) %sret |
| ret void |
| } |
| |
| ; Loads a bfloat from %in, bitcasts it to i16 and stores the i16 to %out. |
| ; Per the checks below, every run emits just a ushort load followed by a short store; |
| ; no instruction is expected for the bitcast itself. |
| define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; GCN-LABEL: test_bitcast_from_bfloat: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_bitcast_from_bfloat: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_bitcast_from_bfloat: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_load_ushort v0, v[0:1] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: flat_store_short v[2:3], v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_bitcast_from_bfloat: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_load_ushort v0, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_store_short v[2:3], v0, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_bitcast_from_bfloat: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_load_ushort v0, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_store_short v[2:3], v0, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %val = load bfloat, ptr addrspace(1) %in |
| %val_int = bitcast bfloat %val to i16 |
| store i16 %val_int, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Loads an i16 from %in, bitcasts it to bfloat and stores the bfloat to %out. |
| ; Note the parameter order is (%out, %in), so the checks below load through v[2:3] and |
| ; store through v[0:1]. Every run emits just a ushort load followed by a short store; |
| ; no instruction is expected for the bitcast itself. |
| define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) { |
| ; GCN-LABEL: test_bitcast_to_bfloat: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s6, 0 |
| ; GCN-NEXT: s_mov_b32 s7, 0xf000 |
| ; GCN-NEXT: s_mov_b32 s4, s6 |
| ; GCN-NEXT: s_mov_b32 s5, s6 |
| ; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_bitcast_to_bfloat: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s6, 0 |
| ; GFX7-NEXT: s_mov_b32 s7, 0xf000 |
| ; GFX7-NEXT: s_mov_b32 s4, s6 |
| ; GFX7-NEXT: s_mov_b32 s5, s6 |
| ; GFX7-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_bitcast_to_bfloat: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: flat_load_ushort v2, v[2:3] |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: flat_store_short v[0:1], v2 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_bitcast_to_bfloat: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: global_load_ushort v2, v[2:3], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_store_short v[0:1], v2, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_bitcast_to_bfloat: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_load_ushort v2, v[2:3], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_store_short v[0:1], v2, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %val = load i16, ptr addrspace(1) %in |
| %val_fp = bitcast i16 %val to bfloat |
| store bfloat %val_fp, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; Returns its bfloat argument unchanged. |
| ; Per the checks below, every run expects only the entry wait(s) and the return, with |
| ; no data-movement instructions in between. |
| define bfloat @test_ret(bfloat %in) { |
| ; GCN-LABEL: test_ret: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_ret: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_ret: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_ret: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_ret: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| ret bfloat %in |
| } |
| |
| ; Returns its <2 x bfloat> argument unchanged. |
| ; Per the checks below, every run expects only the entry wait(s) and the return, with |
| ; no data-movement instructions in between. |
| define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) { |
| ; GCN-LABEL: test_ret_v2bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_ret_v2bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_ret_v2bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_ret_v2bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_ret_v2bf16: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| ret <2 x bfloat> %in |
| } |
| |
| ; Returns its <3 x bfloat> argument unchanged. |
| ; Unlike the other return tests, the checks below expect extra instructions on newer |
| ; targets: the tonga run masks v1 with 0xffff, and the gfx900 and gfx1010 runs also |
| ; rebuild v0 from its own low and high halves with v_and_or_b32. The default-cpu and |
| ; hawaii runs expect no extra instructions. |
| define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) { |
| ; GCN-LABEL: test_ret_v3bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_ret_v3bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_ret_v3bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_ret_v3bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 |
| ; GFX9-NEXT: s_mov_b32 s4, 0xffff |
| ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 |
| ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_ret_v3bf16: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 |
| ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| ret <3 x bfloat> %in |
| } |
| |
| ; Returns its <4 x bfloat> argument unchanged. |
| ; Per the checks below, every run expects only the entry wait(s) and the return, with |
| ; no data-movement instructions in between. |
| define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) { |
| ; GCN-LABEL: test_ret_v4bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_ret_v4bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_ret_v4bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_ret_v4bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_ret_v4bf16: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| ret <4 x bfloat> %in |
| } |
| |
| ; Returns its <8 x bfloat> argument unchanged. |
| ; Per the checks below, every run expects only the entry wait(s) and the return, with |
| ; no data-movement instructions in between. |
| define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) { |
| ; GCN-LABEL: test_ret_v8bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_ret_v8bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_ret_v8bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_ret_v8bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_ret_v8bf16: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| ret <8 x bfloat> %in |
| } |
| |
| ; Returns its <16 x bfloat> argument unchanged. |
| ; Per the checks below, every run expects only the entry wait(s) and the return, with |
| ; no data-movement instructions in between. |
| define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) { |
| ; GCN-LABEL: test_ret_v16bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_ret_v16bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_ret_v16bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_ret_v16bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_ret_v16bf16: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| ret <16 x bfloat> %in |
| } |
| |
| ; Calls @test_arg_store with a bfloat argument, then does a volatile store of the call |
| ; result to the addrspace(5) pointer %out. |
| ; Per the checks below, each run saves s33 and sets up a frame, spills v2 with all lanes |
| ; enabled (s_xor_saveexec), keeps the return address s[30:31] in lanes 0 and 1 of v2, |
| ; loads the callee address through a gotpcrel32 relocation, and calls it with |
| ; s_swappc_b64. After the call the result in v0 is stored as 16 bits at the address in |
| ; v1 and the saved state is restored. |
| ; NOTE(review): the call site types @test_arg_store as returning bfloat and taking one |
| ; argument; its definition is outside this section, so confirm the mismatch (if any) is |
| ; intentional before changing it. |
| define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GCN-NEXT: s_getpc_b64 s[4:5] |
| ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 |
| ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 |
| ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_readlane_b32 s31, v2, 1 |
| ; GCN-NEXT: v_readlane_b32 s30, v2, 0 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX7-NEXT: s_getpc_b64 s[4:5] |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX7-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readlane_b32 s31, v2, 1 |
| ; GFX7-NEXT: v_readlane_b32 s30, v2, 0 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX8-NEXT: s_getpc_b64 s[4:5] |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX8-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 |
| ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 |
| ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_call: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| ; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| %result = call bfloat @test_arg_store(bfloat %in) |
| store volatile bfloat %result, ptr addrspace(5) %out |
| ret void |
| } |
| |
| define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call_v2bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GCN-NEXT: s_getpc_b64 s[4:5] |
| ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_add_i32_e32 v4, vcc, 2, v2 |
| ; GCN-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_readlane_b32 s31, v3, 1 |
| ; GCN-NEXT: v_readlane_b32 s30, v3, 0 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call_v2bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX7-NEXT: s_getpc_b64 s[4:5] |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX7-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 2, v2 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readlane_b32 s31, v3, 1 |
| ; GFX7-NEXT: v_readlane_b32 s30, v3, 0 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call_v2bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX8-NEXT: s_getpc_b64 s[4:5] |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX8-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 |
| ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call_v2bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 |
| ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_call_v2bf16: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| ; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in) |
| store volatile <2 x bfloat> %result, ptr addrspace(5) %out |
| ret void |
| } |
| |
| ; Checks a call that passes and returns <3 x bfloat>, followed by a volatile |
| ; store of the returned vector through the addrspace(5) pointer %out. |
| ; The assertions below are autogenerated (see the note at the top of the file); |
| ; regenerate them with utils/update_llc_test_checks.py rather than editing by hand. |
| ; In the expected output, the GCN and GFX7 variants shift the returned registers |
| ; right by 16 and pack v0/v1 with v_alignbit_b32 before storing, while the GFX8, |
| ; GFX9 and GFX10 variants store v0 as a dword and v1 as a short at byte offset 4. |
| define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call_v3bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v4, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v4, s31, 1 |
| ; GCN-NEXT: s_getpc_b64 s[4:5] |
| ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v3 |
| ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GCN-NEXT: buffer_store_short v2, v5, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_readlane_b32 s31, v4, 1 |
| ; GCN-NEXT: v_readlane_b32 s30, v4, 0 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call_v3bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX7-NEXT: s_getpc_b64 s[4:5] |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX7-NEXT: v_writelane_b32 v4, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3 |
| ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 |
| ; GFX7-NEXT: v_readlane_b32 s30, v4, 0 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call_v3bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX8-NEXT: s_getpc_b64 s[4:5] |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX8-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX8-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v2 |
| ; GFX8-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_readlane_b32 s31, v3, 1 |
| ; GFX8-NEXT: v_readlane_b32 s30, v3, 0 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call_v3bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 |
| ; GFX9-NEXT: s_mov_b32 s4, 0xffff |
| ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v4 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 |
| ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_call_v3bf16: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v4 |
| ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| ; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| ; REVIEW: the callee is named test_arg_store_v2bf16 although it is called with |
| ; <3 x bfloat> here; presumably carried over from the v2 variant of this test. |
| ; Confirm before changing it: the symbol name is also baked into the checks above. |
| %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in) |
| ; The store is volatile, so the expected output above keeps every store. |
| store volatile <3 x bfloat> %result, ptr addrspace(5) %out |
| ret void |
| } |
| |
| ; Checks a call that passes and returns <4 x bfloat>, followed by a volatile |
| ; store of the returned vector through the addrspace(5) pointer %out. |
| ; The assertions below are autogenerated (see the note at the top of the file); |
| ; regenerate them with utils/update_llc_test_checks.py rather than editing by hand. |
| ; In the expected output, the GCN and GFX7 variants shift each of v0..v3 right by |
| ; 16 and emit four short stores; the GFX8 variant stores the low halves of v0/v1 |
| ; directly and the shifted high halves separately; the GFX9 and GFX10 variants |
| ; use buffer_store_short_d16_hi for the high halves. |
| define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call_v4bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v5, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v5, s31, 1 |
| ; GCN-NEXT: s_getpc_b64 s[4:5] |
| ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_add_i32_e32 v6, vcc, 6, v4 |
| ; GCN-NEXT: v_add_i32_e32 v7, vcc, 4, v4 |
| ; GCN-NEXT: v_add_i32_e32 v8, vcc, 2, v4 |
| ; GCN-NEXT: buffer_store_short v3, v6, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v2, v7, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_readlane_b32 s31, v5, 1 |
| ; GCN-NEXT: v_readlane_b32 s30, v5, 0 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call_v4bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX7-NEXT: s_getpc_b64 s[4:5] |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX7-NEXT: v_writelane_b32 v5, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v5, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 6, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: buffer_store_short v3, v6, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readlane_b32 s31, v5, 1 |
| ; GFX7-NEXT: v_readlane_b32 s30, v5, 0 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call_v4bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX8-NEXT: s_getpc_b64 s[4:5] |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX8-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GFX8-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 |
| ; GFX8-NEXT: buffer_store_short v1, v6, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v2 |
| ; GFX8-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 |
| ; GFX8-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_readlane_b32 s31, v3, 1 |
| ; GFX8-NEXT: v_readlane_b32 s30, v3, 0 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call_v4bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 |
| ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_call_v4bf16: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| ; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| ; REVIEW: the callee is named test_arg_store_v2bf16 although it is called with |
| ; <4 x bfloat> here; presumably carried over from the v2 variant of this test. |
| ; Confirm before changing it: the symbol name is also baked into the checks above. |
| %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in) |
| ; The store is volatile, so the expected output above keeps every store. |
| store volatile <4 x bfloat> %result, ptr addrspace(5) %out |
| ret void |
| } |
| |
| define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call_v8bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v9, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v9, s31, 1 |
| ; GCN-NEXT: s_getpc_b64 s[4:5] |
| ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_add_i32_e32 v10, vcc, 14, v8 |
| ; GCN-NEXT: v_add_i32_e32 v11, vcc, 12, v8 |
| ; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v8 |
| ; GCN-NEXT: v_add_i32_e32 v13, vcc, 8, v8 |
| ; GCN-NEXT: v_add_i32_e32 v14, vcc, 6, v8 |
| ; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v8 |
| ; GCN-NEXT: v_add_i32_e32 v16, vcc, 2, v8 |
| ; GCN-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v6, v11, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v4, v13, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v3, v14, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v1, v16, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_readlane_b32 s31, v9, 1 |
| ; GCN-NEXT: v_readlane_b32 s30, v9, 0 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call_v8bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX7-NEXT: s_getpc_b64 s[4:5] |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX7-NEXT: v_writelane_b32 v9, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v9, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 14, v8 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v8 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v8 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v8 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v8 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v8 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readlane_b32 s31, v9, 1 |
| ; GFX7-NEXT: v_readlane_b32 s30, v9, 0 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call_v8bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX8-NEXT: s_getpc_b64 s[4:5] |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX8-NEXT: v_writelane_b32 v5, s30, 0 |
| ; GFX8-NEXT: v_writelane_b32 v5, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 12, v4 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 |
| ; GFX8-NEXT: buffer_store_short v3, v10, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v4 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 |
| ; GFX8-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 |
| ; GFX8-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v4 |
| ; GFX8-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v4 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 |
| ; GFX8-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v4 |
| ; GFX8-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 |
| ; GFX8-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_readlane_b32 s31, v5, 1 |
| ; GFX8-NEXT: v_readlane_b32 s30, v5, 0 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call_v8bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v5, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v5, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen offset:12 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen offset:8 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen offset:4 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_readlane_b32 s31, v5, 1 |
| ; GFX9-NEXT: v_readlane_b32 s30, v5, 0 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_call_v8bf16: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v5, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v5, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen offset:12 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen offset:8 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen offset:4 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_readlane_b32 s31, v5, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v5, 0 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| ; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in) |
| store volatile <8 x bfloat> %result, ptr addrspace(5) %out |
| ret void |
| } |
| |
| ; test_call_v16bf16: passes a <16 x bfloat> value through a call and volatile-stores |
| ; the <16 x bfloat> call result to the addrspace(5) pointer %out, one 16-bit store per element. |
| ; |
| ; What the autogenerated checks below pin down: |
| ; - GCN / GFX7: after the call, each of v0..v15 is shifted right by 16 and written with |
| ; buffer_store_short at byte offsets 0..30 from v16. |
| ; - GFX8: v0..v7 are each stored twice relative to v8: the register itself at offsets |
| ; 0,4,..,28 and a copy shifted right by 16 at offsets 2,6,..,30. |
| ; - GFX9 / GFX10: same layout relative to v8, but the upper-half store uses |
| ; buffer_store_short_d16_hi with an immediate offset instead of a shift and an add. |
| ; - Every element store is followed by a wait (s_waitcnt, or s_waitcnt_vscnt on GFX10). |
| ; |
| ; NOTE(review): the callee symbol is named test_arg_store_v2bf16 but is called here with a |
| ; <16 x bfloat> signature; its definition is outside this part of the file - confirm the |
| ; mismatch is intentional before regenerating the checks. |
| define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call_v16bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v17, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v17, s31, 1 |
| ; GCN-NEXT: s_getpc_b64 s[4:5] |
| ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 |
| ; GCN-NEXT: v_add_i32_e32 v18, vcc, 30, v16 |
| ; GCN-NEXT: v_add_i32_e32 v19, vcc, 28, v16 |
| ; GCN-NEXT: v_add_i32_e32 v20, vcc, 26, v16 |
| ; GCN-NEXT: v_add_i32_e32 v21, vcc, 24, v16 |
| ; GCN-NEXT: v_add_i32_e32 v22, vcc, 22, v16 |
| ; GCN-NEXT: v_add_i32_e32 v23, vcc, 20, v16 |
| ; GCN-NEXT: v_add_i32_e32 v24, vcc, 18, v16 |
| ; GCN-NEXT: v_add_i32_e32 v25, vcc, 16, v16 |
| ; GCN-NEXT: v_add_i32_e32 v26, vcc, 14, v16 |
| ; GCN-NEXT: v_add_i32_e32 v27, vcc, 12, v16 |
| ; GCN-NEXT: v_add_i32_e32 v28, vcc, 10, v16 |
| ; GCN-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v15, vcc, 8, v16 |
| ; GCN-NEXT: v_add_i32_e32 v18, vcc, 6, v16 |
| ; GCN-NEXT: buffer_store_short v14, v19, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v16 |
| ; GCN-NEXT: v_add_i32_e32 v19, vcc, 2, v16 |
| ; GCN-NEXT: buffer_store_short v13, v20, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v12, v21, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v11, v22, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v10, v23, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v9, v24, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v8, v25, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v7, v26, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v6, v27, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v5, v28, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v4, v15, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v3, v18, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v1, v19, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_readlane_b32 s31, v17, 1 |
| ; GCN-NEXT: v_readlane_b32 s30, v17, 0 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call_v16bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX7-NEXT: s_getpc_b64 s[4:5] |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX7-NEXT: v_writelane_b32 v17, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v17, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 |
| ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 30, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 |
| ; GFX7-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 |
| ; GFX7-NEXT: buffer_store_short v14, v15, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v14, vcc, 26, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 |
| ; GFX7-NEXT: buffer_store_short v13, v14, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v13, vcc, 24, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 |
| ; GFX7-NEXT: buffer_store_short v12, v13, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v12, vcc, 22, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 |
| ; GFX7-NEXT: buffer_store_short v11, v12, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v11, vcc, 20, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 |
| ; GFX7-NEXT: buffer_store_short v10, v11, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v10, vcc, 18, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 |
| ; GFX7-NEXT: buffer_store_short v9, v10, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 16, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: buffer_store_short v8, v9, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 14, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_readlane_b32 s31, v17, 1 |
| ; GFX7-NEXT: v_readlane_b32 s30, v17, 0 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call_v16bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX8-NEXT: s_getpc_b64 s[4:5] |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX8-NEXT: v_writelane_b32 v9, s30, 0 |
| ; GFX8-NEXT: v_writelane_b32 v9, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX8-NEXT: v_add_u32_e32 v18, vcc, 28, v8 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7 |
| ; GFX8-NEXT: buffer_store_short v7, v18, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 24, v8 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v6 |
| ; GFX8-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v8 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5 |
| ; GFX8-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v8 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4 |
| ; GFX8-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 12, v8 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3 |
| ; GFX8-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v8 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v2 |
| ; GFX8-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0 |
| ; GFX8-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 30, v8 |
| ; GFX8-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 26, v8 |
| ; GFX8-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 22, v8 |
| ; GFX8-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 18, v8 |
| ; GFX8-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v8 |
| ; GFX8-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v8 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1 |
| ; GFX8-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v8 |
| ; GFX8-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v8 |
| ; GFX8-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_readlane_b32 s31, v9, 1 |
| ; GFX8-NEXT: v_readlane_b32 s30, v9, 0 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call_v16bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v9, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v9, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen offset:28 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v6, v8, s[0:3], 0 offen offset:24 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v5, v8, s[0:3], 0 offen offset:20 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v4, v8, s[0:3], 0 offen offset:16 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v3, v8, s[0:3], 0 offen offset:12 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v2, v8, s[0:3], 0 offen offset:8 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen offset:4 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_readlane_b32 s31, v9, 1 |
| ; GFX9-NEXT: v_readlane_b32 s30, v9, 0 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| ; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_call_v16bf16: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v9, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v9, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen offset:28 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v6, v8, s[0:3], 0 offen offset:24 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v5, v8, s[0:3], 0 offen offset:20 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v4, v8, s[0:3], 0 offen offset:16 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v3, v8, s[0:3], 0 offen offset:12 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v2, v8, s[0:3], 0 offen offset:8 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen offset:4 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_readlane_b32 s31, v9, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v9, 0 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| ; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| ; Forward the incoming vector unchanged to the callee and take a vector of the same type back. |
| %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in) |
| ; Volatile store of the call result through the addrspace(5) pointer argument. |
| store volatile <16 x bfloat> %result, ptr addrspace(5) %out |
| ret void |
| } |
| |
| ; test_alloca_load_store_ret: round-trips a bfloat argument through an addrspace(5) alloca |
| ; (volatile store followed by volatile load) and returns the reloaded value. |
| ; |
| ; What the autogenerated checks below pin down: |
| ; - GCN / GFX7 / GFX8: v0 is shifted right by 16, written with buffer_store_short at s32, |
| ; read back with buffer_load_ushort (glc) and shifted left by 16 before returning. |
| ; - GFX9 / GFX10: no shifts; the store uses buffer_store_short_d16_hi and the reload uses |
| ; buffer_load_short_d16_hi into a register that was first set to 0. |
| define bfloat @test_alloca_load_store_ret(bfloat %in) { |
| ; GCN-LABEL: test_alloca_load_store_ret: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_alloca_load_store_ret: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_alloca_load_store_ret: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_alloca_load_store_ret: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_alloca_load_store_ret: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s32 glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| ; Two-byte-aligned slot for a single bfloat in addrspace(5). |
| %in.addr = alloca bfloat, align 2, addrspace(5) |
| ; Both accesses are volatile, and the checks above expect a separate store and load on every target. |
| store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2 |
| %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2 |
| ret bfloat %loaded |
| } |
| |
| define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { |
| ; GCN-LABEL: test_overflow_stack: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 |
| ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 |
| ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 |
| ; GCN-NEXT: s_waitcnt vmcnt(2) |
| ; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 |
| ; GCN-NEXT: s_waitcnt vmcnt(2) |
| ; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen |
| ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 |
| ; GCN-NEXT: s_waitcnt vmcnt(2) |
| ; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen |
| ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 |
| ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0 |
| ; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen |
| ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x64, v0 |
| ; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x60, v0 |
| ; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x5c, v0 |
| ; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen |
| ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 |
| ; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0 |
| ; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 |
| ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0 |
| ; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 |
| ; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 |
| ; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen |
| ; GCN-NEXT: v_add_i32_e32 v2, vcc, 56, v0 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v24, vcc, 52, v0 |
| ; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v23, vcc, 48, v0 |
| ; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 |
| ; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 |
| ; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 |
| ; GCN-NEXT: buffer_store_dword v21, v30, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0 |
| ; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0 |
| ; GCN-NEXT: buffer_store_dword v20, v26, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v0 |
| ; GCN-NEXT: v_add_i32_e32 v26, vcc, 20, v0 |
| ; GCN-NEXT: buffer_store_dword v19, v29, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v19, vcc, 16, v0 |
| ; GCN-NEXT: v_add_i32_e32 v29, vcc, 12, v0 |
| ; GCN-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0 |
| ; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0 |
| ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 |
| ; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v14, v23, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v12, v22, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v9, v30, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v7, v26, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen |
| ; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_overflow_stack: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen |
| ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 |
| ; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen |
| ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 |
| ; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen |
| ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 |
| ; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x74, v0 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 |
| ; GFX7-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 |
| ; GFX7-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 |
| ; GFX7-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 |
| ; GFX7-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 |
| ; GFX7-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 |
| ; GFX7-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 |
| ; GFX7-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 |
| ; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 |
| ; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 |
| ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 |
| ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 |
| ; GFX7-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 64, v0 |
| ; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 60, v0 |
| ; GFX7-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0 |
| ; GFX7-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 52, v0 |
| ; GFX7-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 48, v0 |
| ; GFX7-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 |
| ; GFX7-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 40, v0 |
| ; GFX7-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 36, v0 |
| ; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v0 |
| ; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v0 |
| ; GFX7-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 24, v0 |
| ; GFX7-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 20, v0 |
| ; GFX7-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 |
| ; GFX7-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0 |
| ; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 |
| ; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 |
| ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 |
| ; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen |
| ; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_overflow_stack: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 |
| ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x7c, v0 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen |
| ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 |
| ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x78, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen |
| ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 |
| ; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x74, v0 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 |
| ; GFX8-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 |
| ; GFX8-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 |
| ; GFX8-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 |
| ; GFX8-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 |
| ; GFX8-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 |
| ; GFX8-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 |
| ; GFX8-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 |
| ; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 |
| ; GFX8-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 |
| ; GFX8-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 |
| ; GFX8-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 |
| ; GFX8-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 |
| ; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 60, v0 |
| ; GFX8-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 56, v0 |
| ; GFX8-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 52, v0 |
| ; GFX8-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v0 |
| ; GFX8-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 44, v0 |
| ; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0 |
| ; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 36, v0 |
| ; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 |
| ; GFX8-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v0 |
| ; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 24, v0 |
| ; GFX8-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v0 |
| ; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 |
| ; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 12, v0 |
| ; GFX8-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 |
| ; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 |
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 |
| ; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen |
| ; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_overflow_stack: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 |
| ; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 |
| ; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 |
| ; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 |
| ; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 |
| ; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 |
| ; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 |
| ; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 |
| ; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 |
| ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 |
| ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 |
| ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 |
| ; GFX9-NEXT: s_nop 0 |
| ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 |
| ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 |
| ; GFX9-NEXT: s_nop 0 |
| ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 |
| ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 |
| ; GFX9-NEXT: s_nop 0 |
| ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 |
| ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 |
| ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 |
| ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 |
| ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 |
| ; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 |
| ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 |
| ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 |
| ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 |
| ; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 |
| ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 |
| ; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 |
| ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 |
| ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 |
| ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 |
| ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen |
| ; GFX9-NEXT: s_waitcnt vmcnt(18) |
| ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:124 |
| ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:120 |
| ; GFX9-NEXT: s_waitcnt vmcnt(18) |
| ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:116 |
| ; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: test_overflow_stack: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_clause 0x2 |
| ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 |
| ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 |
| ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 |
| ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 |
| ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 |
| ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 |
| ; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 |
| ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 |
| ; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 |
| ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 |
| ; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 |
| ; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 |
| ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 |
| ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 |
| ; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 |
| ; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 |
| ; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 |
| ; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 |
| ; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 |
| ; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 |
| ; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 |
| ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 |
| ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 |
| ; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 |
| ; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 |
| ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 |
| ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 |
| ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 |
| ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 |
| ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 |
| ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 |
| ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen |
| ; GFX10-NEXT: s_waitcnt vmcnt(2) |
| ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:124 |
| ; GFX10-NEXT: s_waitcnt vmcnt(1) |
| ; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:116 |
| ; GFX10-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128 |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0 |
| %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1 |
| ret { <32 x i32>, bfloat } %ins.1 |
| } |