blob: d9f2fc55709a6af37be1f2f0a893abb56fad130a [file]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
; Declarations of the load.tr intrinsics exercised by the tests in this file.
; Global-memory variants: each takes a ptr addrspace(1) operand.
declare <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1))
declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
declare <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1))
declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1))
declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1))
; DS variants: each takes a ptr addrspace(3) operand.
declare <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32.p3(ptr addrspace(3))
declare <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32.p3(ptr addrspace(3))
declare <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3))
declare <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16.p3(ptr addrspace(3))
declare <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16.p3(ptr addrspace(3))
declare <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16.p3(ptr addrspace(3))
; global.load.tr4.b64 with the address passed as an ordinary (non-inreg)
; argument. The load reads from %addr + 4 x i64 (32 bytes) and the <2 x i32>
; result is stored to %use. The checks expect the address in v[0:1] with the
; "off" form and the constant folded into the instruction as offset:32.
define amdgpu_ps void @global_load_tr4_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr4_b64_vaddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: global_load_tr4_b64 v[0:1], v[0:1], off offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1) %gep)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
; global.load.tr4.b64 with the address passed inreg. The load reads from
; %addr + 4 x i64 (32 bytes) and the <2 x i32> result is stored to %use.
; The checks expect the SGPR pair s[0:1] as the base together with a VGPR
; initialised to 0, and the constant folded into the instruction as offset:32.
define amdgpu_ps void @global_load_tr4_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr4_b64_saddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_load_tr4_b64 v[2:3], v2, s[0:1] offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1) %gep)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
; global.load.tr.b64 (the intrinsic name carries no element-size suffix) with
; the address passed as an ordinary argument. The checks expect it to select
; the global_load_tr8_b64 instruction, with the address in v[0:1] and the
; 4 x i64 (32 byte) GEP folded as offset:32. The result is stored to %use.
define amdgpu_ps void @global_load_tr8_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr8_b64_vaddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: global_load_tr8_b64 v[0:1], v[0:1], off offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
; global.load.tr.b64 (the intrinsic name carries no element-size suffix) with
; the address passed inreg. The checks expect the global_load_tr8_b64
; instruction using s[0:1] as the base plus a VGPR initialised to 0, with the
; 4 x i64 (32 byte) GEP folded as offset:32. The result is stored to %use.
define amdgpu_ps void @global_load_tr8_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr8_b64_saddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_load_tr8_b64 v[2:3], v2, s[0:1] offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
; global.load.tr6.b96 with the address passed as an ordinary argument. The
; <3 x i32> result of the load from %addr + 4 x i64 (32 bytes) is stored to
; %use. The checks expect the address in v[0:1], the result in v[4:6], and
; the constant folded as offset:32.
define amdgpu_ps void @global_load_tr6_b96_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr6_b96_vaddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: global_load_tr6_b96 v[4:6], v[0:1], off offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b96 v[2:3], v[4:6], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
store <3 x i32> %val, ptr addrspace(1) %use
ret void
}
; global.load.tr6.b96 with the address passed inreg. The <3 x i32> result of
; the load from %addr + 4 x i64 (32 bytes) is stored to %use. The checks
; expect s[0:1] as the base plus a VGPR initialised to 0, with the constant
; folded as offset:32.
define amdgpu_ps void @global_load_tr6_b96_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr6_b96_saddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v2, s[0:1] offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
store <3 x i32> %val, ptr addrspace(1) %use
ret void
}
; global.load.tr.b128 returning <8 x i16>, address passed as an ordinary
; argument. The checks expect the global_load_tr16_b128 instruction with the
; address in v[0:1] and the 4 x i64 (32 byte) GEP folded as offset:32. The
; result is stored to %use.
define amdgpu_ps void @global_load_tr16_b128_v8i16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8i16_vaddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep)
store <8 x i16> %val, ptr addrspace(1) %use
ret void
}
; global.load.tr.b128 returning <8 x i16>, address passed inreg. The checks
; expect the global_load_tr16_b128 instruction using s[0:1] as the base plus
; a VGPR initialised to 0, with the 4 x i64 (32 byte) GEP folded as
; offset:32. The result is stored to %use.
define amdgpu_ps void @global_load_tr16_b128_v8i16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8i16_saddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep)
store <8 x i16> %val, ptr addrspace(1) %use
ret void
}
; global.load.tr.b128 returning <8 x half>, address passed as an ordinary
; argument. The expected assembly is the same as for the <8 x i16> vaddr
; test: global_load_tr16_b128 with the address in v[0:1] and offset:32.
; The result is stored to %use.
define amdgpu_ps void @global_load_tr16_b128_v8f16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8f16_vaddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep)
store <8 x half> %val, ptr addrspace(1) %use
ret void
}
; global.load.tr.b128 returning <8 x half>, address passed inreg. The
; expected assembly is the same as for the <8 x i16> saddr test:
; global_load_tr16_b128 with s[0:1] as the base, a VGPR initialised to 0,
; and offset:32. The result is stored to %use.
define amdgpu_ps void @global_load_tr16_b128_v8f16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8f16_saddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep)
store <8 x half> %val, ptr addrspace(1) %use
ret void
}
; global.load.tr.b128 returning <8 x bfloat>, address passed as an ordinary
; argument. The checks expect global_load_tr16_b128 with the address in
; v[0:1] and the 4 x i64 (32 byte) GEP folded as offset:32. The result is
; stored to %use.
; NOTE(review): this function is spelled "v8b16" while the matching saddr
; test and the intrinsic use "v8bf16" -- looks like a typo. Renaming it also
; requires updating the LABEL check lines, so it is left as-is here.
define amdgpu_ps void @global_load_tr16_b128_v8b16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8b16_vaddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep)
store <8 x bfloat> %val, ptr addrspace(1) %use
ret void
}
; global.load.tr.b128 returning <8 x bfloat>, address passed inreg. The
; checks expect global_load_tr16_b128 with s[0:1] as the base, a VGPR
; initialised to 0, and the 4 x i64 (32 byte) GEP folded as offset:32.
; The result is stored to %use.
define amdgpu_ps void @global_load_tr16_b128_v8bf16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr16_b128_v8bf16_saddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep)
store <8 x bfloat> %val, ptr addrspace(1) %use
ret void
}
; ds.load.tr4.b64 from an addrspace(3) pointer at %addr + 4 x i64 (32 bytes);
; the <2 x i32> result is stored to the global pointer %use. Both selectors
; are expected to fold the constant as offset:32. They differ only in which
; VGPRs receive the copy of %use, hence the separate SDAG and GISEL checks.
define amdgpu_ps void @ds_load_tr4_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-SDAG-LABEL: ds_load_tr4_b64:
; GFX1250-SDAG: ; %bb.0: ; %entry
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: ds_load_tr4_b64 v[0:1], v0 offset:32
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: ds_load_tr4_b64:
; GFX1250-GISEL: ; %bb.0: ; %entry
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: ds_load_tr4_b64 v[0:1], v0 offset:32
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: global_store_b64 v[4:5], v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32.p3(ptr addrspace(3) %gep)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
; ds.load.tr8.b64 from an addrspace(3) pointer at %addr + 4 x i64 (32 bytes);
; the <2 x i32> result is stored to the global pointer %use. Both selectors
; are expected to fold the constant as offset:32. They differ only in which
; VGPRs receive the copy of %use, hence the separate SDAG and GISEL checks.
define amdgpu_ps void @ds_load_tr8_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-SDAG-LABEL: ds_load_tr8_b64:
; GFX1250-SDAG: ; %bb.0: ; %entry
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: ds_load_tr8_b64 v[0:1], v0 offset:32
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: ds_load_tr8_b64:
; GFX1250-GISEL: ; %bb.0: ; %entry
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: ds_load_tr8_b64 v[0:1], v0 offset:32
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: global_store_b64 v[4:5], v[0:1], off
; GFX1250-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32.p3(ptr addrspace(3) %gep)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
; ds.load.tr6.b96 from an addrspace(3) pointer at %addr + 4 x i64 (32 bytes);
; the <3 x i32> result is stored to the global pointer %use. Both selectors
; are expected to fold the constant as offset:32 and to move %use into
; v[4:5]; only the operand order of the v_dual_mov_b32 copy differs.
define amdgpu_ps void @ds_load_tr6_b96(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-SDAG-LABEL: ds_load_tr6_b96:
; GFX1250-SDAG: ; %bb.0: ; %entry
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX1250-SDAG-NEXT: ds_load_tr6_b96 v[0:2], v0 offset:32
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: global_store_b96 v[4:5], v[0:2], off
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: ds_load_tr6_b96:
; GFX1250-GISEL: ; %bb.0: ; %entry
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: ds_load_tr6_b96 v[0:2], v0 offset:32
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: global_store_b96 v[4:5], v[0:2], off
; GFX1250-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
store <3 x i32> %val, ptr addrspace(1) %use
ret void
}
; ds.load.tr16.b128 returning <8 x i16> from an addrspace(3) pointer at
; %addr + 4 x i64 (32 bytes); the result is stored to the global pointer
; %use. Both selectors are expected to fold the constant as offset:32 and to
; move %use into v[4:5]; only the operand order of the copy differs.
define amdgpu_ps void @ds_load_tr16_b128_v8i16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8i16:
; GFX1250-SDAG: ; %bb.0: ; %entry
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX1250-SDAG-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8i16:
; GFX1250-GISEL: ; %bb.0: ; %entry
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX1250-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16.p3(ptr addrspace(3) %gep)
store <8 x i16> %val, ptr addrspace(1) %use
ret void
}
; ds.load.tr16.b128 returning <8 x half> from an addrspace(3) pointer at
; %addr + 4 x i64 (32 bytes); the result is stored to the global pointer
; %use. The expected assembly matches the <8 x i16> DS test: offset:32 is
; folded and only the operand order of the %use copy differs per selector.
define amdgpu_ps void @ds_load_tr16_b128_v8f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8f16:
; GFX1250-SDAG: ; %bb.0: ; %entry
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX1250-SDAG-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8f16:
; GFX1250-GISEL: ; %bb.0: ; %entry
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX1250-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16.p3(ptr addrspace(3) %gep)
store <8 x half> %val, ptr addrspace(1) %use
ret void
}
; ds.load.tr16.b128 returning <8 x bfloat> from an addrspace(3) pointer at
; %addr + 4 x i64 (32 bytes); the result is stored to the global pointer
; %use. Unlike the other DS tests, the checks here use the prefix shared by
; both RUN lines, i.e. one expected sequence covers both selectors.
define amdgpu_ps void @ds_load_tr16_b128_v8bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: ds_load_tr16_b128_v8bf16:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX1250-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16.p3(ptr addrspace(3) %gep)
store <8 x bfloat> %val, ptr addrspace(1) %use
ret void
}
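; The tr6.b96 loads are a special case that does not require aligned VGPR
; tuples. The tests below are meant to make sure no copies are required for
; the unaligned ABI return value.
; NOTE(review): the current autogenerated checks still load into v[2:4] and
; then copy the result down with v_dual_mov_b32, so copies are emitted today
; -- confirm whether that is the intended output.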
; Non-entry function (no amdgpu_ps) returning { i32 0, <3 x i32> %val }, where
; %val is loaded with global.load.tr6.b96 from %addr + 4 x i64 (32 bytes).
; %use is not referenced in the body. The checks expect the load into v[2:4]
; with the address in v[0:1], followed by moves that place 0 in v0 and the
; loaded elements in v1, v2 and v3, then a return via s_set_pc_i64 s[30:31].
define { i32, <3 x i32> } @global_load_tr6_b96_vaddr_no_align2_requirement(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr6_b96_vaddr_no_align2_requirement:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v[0:1], off offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
%insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
%insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
ret { i32, <3 x i32> } %insert1
}
; Non-entry function (no amdgpu_ps) returning { i32 0, <3 x i32> %val }, where
; %val is loaded with global.load.tr6.b96 from the inreg pointer %addr plus
; 4 x i64 (32 bytes). %use is not referenced in the body. The checks expect
; the load into v[2:4] using s[0:1] and a zeroed v0, followed by moves that
; place 0 in v0 and the loaded elements in v1, v2 and v3, then a return via
; s_set_pc_i64 s[30:31].
define { i32, <3 x i32> } @global_load_tr6_b96_saddr_no_align2_requirement(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: global_load_tr6_b96_saddr_no_align2_requirement:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v0, s[0:1] offset:32
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
%insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
%insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
ret { i32, <3 x i32> } %insert1
}
; Non-entry function (no amdgpu_ps) returning { i32 0, <3 x i32> %val }, where
; %val is loaded with ds.load.tr6.b96 from the addrspace(3) pointer %addr plus
; 4 x i64 (32 bytes). %use is not referenced in the body. The checks expect
; the load into v[2:4] addressed by v0, followed by moves that place 0 in v0
; and the loaded elements in v1, v2 and v3, then a return via
; s_set_pc_i64 s[30:31].
define { i32, <3 x i32> } @ds_load_tr6_b96_no_align2_requirement(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: ds_load_tr6_b96_no_align2_requirement:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: ds_load_tr6_b96 v[2:4], v0 offset:32
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
%insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
%insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
ret { i32, <3 x i32> } %insert1
}