; blob: be351ea026a03a2ab7f21a4e83f4680868839949 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
; Test for llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn whose <4 x i32> data operand
; is taken from the result of a preceding llvm.amdgcn.image.bvh8.intersect.ray
; call. A conditional block sits between the two, and a second intersect-ray
; call follows the stack operation. The autogenerated assertions below expect
; s_wait_bvhcnt 0x0 directly before the ds_bvh_stack instruction, s_wait_dscnt
; 0x0 before the stack results are stored, and s_wait_bvhcnt 0x0 again before
; the second intersect-ray result is stored.
define amdgpu_gs void @test_ds_bvh_stack_push4_pop1(i32 %addr, i32 %data.0, i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, i32 %offset, <4 x i32> inreg %tdescr, ptr addrspace(1) %p.0, ptr addrspace(1) %p.1, ptr addrspace(1) %p.2, ptr addrspace(1) %p.3) {
; CHECK-LABEL: test_ds_bvh_stack_push4_pop1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v21, v6
; CHECK-NEXT: v_dual_mov_b32 v20, v5 :: v_dual_mov_b32 v5, 0
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v32, v21
; CHECK-NEXT: v_mov_b32_e32 v31, v20
; CHECK-NEXT: image_bvh8_intersect_ray v[21:30], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3]
; CHECK-NEXT: v_cmpx_eq_f32_e32 0, v20
; CHECK-NEXT: s_cbranch_execz .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %if
; CHECK-NEXT: global_load_b64 v[6:7], v[12:13], off
; CHECK-NEXT: global_load_b64 v[34:35], v[14:15], off
; CHECK-NEXT: global_load_b64 v[36:37], v[16:17], off
; CHECK-NEXT: global_load_b64 v[38:39], v[18:19], off
; CHECK-NEXT: s_wait_loadcnt 0x2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v7, v35
; CHECK-NEXT: s_wait_loadcnt 0x1
; CHECK-NEXT: v_add3_u32 v6, v6, v34, v36
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT: v_add3_u32 v1, v1, v37, v39
; CHECK-NEXT: v_add3_u32 v1, v6, v38, v1
; CHECK-NEXT: .LBB0_2: ; %end
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_wait_bvhcnt 0x0
; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[21:24]
; CHECK-NEXT: image_bvh8_intersect_ray v[20:29], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3]
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: global_store_b32 v[12:13], v1, off
; CHECK-NEXT: global_store_b32 v[14:15], v0, off
; CHECK-NEXT: s_wait_bvhcnt 0x0
; CHECK-NEXT: global_store_b32 v[16:17], v20, off
; CHECK-NEXT: global_store_b32 v[18:19], v21, off
; CHECK-NEXT: s_endpgm
entry:
; Assemble the scalar origin and direction arguments into <3 x float> vectors.
%ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
%ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0
%ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
%ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
; First intersect-ray call; element 0 of the returned struct is a <10 x i32>.
%v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, i32 %offset, <4 x i32> %tdescr)
%a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0
; Repack elements 0-3 of that vector as the <4 x i32> operand of the stack op.
%val.0 = extractelement <10 x i32> %a, i32 0
%val.1 = extractelement <10 x i32> %a, i32 1
%val.2 = extractelement <10 x i32> %a, i32 2
%val.3 = extractelement <10 x i32> %a, i32 3
%bvh.0 = insertelement <4 x i32> poison, i32 %val.0, i32 0
%bvh.1 = insertelement <4 x i32> %bvh.0, i32 %val.1, i32 1
%bvh.2 = insertelement <4 x i32> %bvh.1, i32 %val.2, i32 2
%bvh = insertelement <4 x i32> %bvh.2, i32 %val.3, i32 3
; Enter the %if block only when ray_origin_x compares ordered-equal to 0.0.
%cnd = fcmp oeq float %ray_origin_x, 0.0
br i1 %cnd, label %if, label %end
if:
; loads to force vgpr pressure
%load.0 = load <2 x i32>, ptr addrspace(1) %p.0
%load.1 = load <2 x i32>, ptr addrspace(1) %p.1
%load.2 = load <2 x i32>, ptr addrspace(1) %p.2
%load.3 = load <2 x i32>, ptr addrspace(1) %p.3
; Sum the four loaded vectors lane-wise, then add the two lanes together to
; produce the alternative data value for the stack operation.
%add.0 = add <2 x i32> %load.0, %load.1
%add.1 = add <2 x i32> %add.0, %load.2
%add.2 = add <2 x i32> %add.1, %load.3
%.i0 = extractelement <2 x i32> %add.2, i32 0
%.i1 = extractelement <2 x i32> %add.2, i32 1
%data.1 = add i32 %.i0, %.i1
br label %end
end:
; %data is the incoming %data.0 argument, or the sum computed in %if.
%data = phi i32 [ %data.0, %entry ], [ %data.1, %if ]
; Stack operation under test; it consumes %bvh from the first intersect-ray call.
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data, <4 x i32> %bvh, i32 0)
; Elements 0 and 1 of the returned pair are stored to %p.0 and %p.1 below.
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
; keep all intersect ray parameters live
; The second call reuses %node_ptr, %ray_extent, %offset and %tdescr, and takes
; its origin and direction from elements 1 and 2 of the first call's result.
%new.origin = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1
%new.dir = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2
%v.2 = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %new.origin, <3 x float> %new.dir, i32 %offset, <4 x i32> %tdescr)
%b = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v.2, 0
; Only the first two elements of the second result are used (stored below).
%c = extractelement <10 x i32> %b, i32 0
%d = extractelement <10 x i32> %b, i32 1
; stores keep pointers live
store i32 %vdst, ptr addrspace(1) %p.0
store i32 %newaddr, ptr addrspace(1) %p.1
store i32 %c, ptr addrspace(1) %p.2
store i32 %d, ptr addrspace(1) %p.3
ret void
}
; Test for llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn whose <8 x i32> data operand
; is taken from the result of a preceding llvm.amdgcn.image.bvh8.intersect.ray
; call. A conditional block sits between the two, and a second intersect-ray
; call follows the stack operation. The autogenerated assertions below expect
; s_wait_bvhcnt 0x0 directly before the ds_bvh_stack instruction, s_wait_dscnt
; 0x0 before the stack results are stored, and s_wait_bvhcnt 0x0 again before
; the second intersect-ray result is stored.
define amdgpu_gs void @test_ds_bvh_stack_push8_pop1(i32 %addr, i32 %data.0, i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, i32 %offset, <4 x i32> inreg %tdescr, ptr addrspace(1) %p.0, ptr addrspace(1) %p.1, ptr addrspace(1) %p.2, ptr addrspace(1) %p.3) {
; CHECK-LABEL: test_ds_bvh_stack_push8_pop1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v21, v6
; CHECK-NEXT: v_dual_mov_b32 v20, v5 :: v_dual_mov_b32 v5, 0
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v32, v21
; CHECK-NEXT: v_mov_b32_e32 v31, v20
; CHECK-NEXT: image_bvh8_intersect_ray v[21:30], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3]
; CHECK-NEXT: v_cmpx_eq_f32_e32 0, v20
; CHECK-NEXT: s_cbranch_execz .LBB1_2
; CHECK-NEXT: ; %bb.1: ; %if
; CHECK-NEXT: global_load_b64 v[6:7], v[12:13], off
; CHECK-NEXT: global_load_b64 v[34:35], v[14:15], off
; CHECK-NEXT: global_load_b64 v[36:37], v[16:17], off
; CHECK-NEXT: global_load_b64 v[38:39], v[18:19], off
; CHECK-NEXT: s_wait_loadcnt 0x2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v7, v35
; CHECK-NEXT: s_wait_loadcnt 0x1
; CHECK-NEXT: v_add3_u32 v6, v6, v34, v36
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT: v_add3_u32 v1, v1, v37, v39
; CHECK-NEXT: v_add3_u32 v1, v6, v38, v1
; CHECK-NEXT: .LBB1_2: ; %end
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_wait_bvhcnt 0x0
; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[21:28]
; CHECK-NEXT: image_bvh8_intersect_ray v[20:29], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3]
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: global_store_b32 v[12:13], v1, off
; CHECK-NEXT: global_store_b32 v[14:15], v0, off
; CHECK-NEXT: s_wait_bvhcnt 0x0
; CHECK-NEXT: global_store_b32 v[16:17], v20, off
; CHECK-NEXT: global_store_b32 v[18:19], v21, off
; CHECK-NEXT: s_endpgm
entry:
; Assemble the scalar origin and direction arguments into <3 x float> vectors.
%ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
%ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0
%ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
%ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
; First intersect-ray call; element 0 of the returned struct is a <10 x i32>.
%v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, i32 %offset, <4 x i32> %tdescr)
%a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0
; Repack elements 0-7 of that vector as the <8 x i32> operand of the stack op.
%val.0 = extractelement <10 x i32> %a, i32 0
%val.1 = extractelement <10 x i32> %a, i32 1
%val.2 = extractelement <10 x i32> %a, i32 2
%val.3 = extractelement <10 x i32> %a, i32 3
%val.4 = extractelement <10 x i32> %a, i32 4
%val.5 = extractelement <10 x i32> %a, i32 5
%val.6 = extractelement <10 x i32> %a, i32 6
%val.7 = extractelement <10 x i32> %a, i32 7
%bvh.0 = insertelement <8 x i32> poison, i32 %val.0, i32 0
%bvh.1 = insertelement <8 x i32> %bvh.0, i32 %val.1, i32 1
%bvh.2 = insertelement <8 x i32> %bvh.1, i32 %val.2, i32 2
%bvh.3 = insertelement <8 x i32> %bvh.2, i32 %val.3, i32 3
%bvh.4 = insertelement <8 x i32> %bvh.3, i32 %val.4, i32 4
%bvh.5 = insertelement <8 x i32> %bvh.4, i32 %val.5, i32 5
%bvh.6 = insertelement <8 x i32> %bvh.5, i32 %val.6, i32 6
%bvh = insertelement <8 x i32> %bvh.6, i32 %val.7, i32 7
; Enter the %if block only when ray_origin_x compares ordered-equal to 0.0.
%cnd = fcmp oeq float %ray_origin_x, 0.0
br i1 %cnd, label %if, label %end
if:
; loads to force vgpr pressure
%load.0 = load <2 x i32>, ptr addrspace(1) %p.0
%load.1 = load <2 x i32>, ptr addrspace(1) %p.1
%load.2 = load <2 x i32>, ptr addrspace(1) %p.2
%load.3 = load <2 x i32>, ptr addrspace(1) %p.3
; Sum the four loaded vectors lane-wise, then add the two lanes together to
; produce the alternative data value for the stack operation.
%add.0 = add <2 x i32> %load.0, %load.1
%add.1 = add <2 x i32> %add.0, %load.2
%add.2 = add <2 x i32> %add.1, %load.3
%.i0 = extractelement <2 x i32> %add.2, i32 0
%.i1 = extractelement <2 x i32> %add.2, i32 1
%data.1 = add i32 %.i0, %.i1
br label %end
end:
; %data is the incoming %data.0 argument, or the sum computed in %if.
%data = phi i32 [ %data.0, %entry ], [ %data.1, %if ]
; Stack operation under test; it consumes %bvh from the first intersect-ray call.
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data, <8 x i32> %bvh, i32 0)
; Elements 0 and 1 of the returned pair are stored to %p.0 and %p.1 below.
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
; keep all intersect ray parameters live
; The second call reuses %node_ptr, %ray_extent, %offset and %tdescr, and takes
; its origin and direction from elements 1 and 2 of the first call's result.
%new.origin = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1
%new.dir = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2
%v.2 = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %new.origin, <3 x float> %new.dir, i32 %offset, <4 x i32> %tdescr)
%b = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v.2, 0
; Only the first two elements of the second result are used (stored below).
%c = extractelement <10 x i32> %b, i32 0
%d = extractelement <10 x i32> %b, i32 1
; stores keep pointers live
store i32 %vdst, ptr addrspace(1) %p.0
store i32 %newaddr, ptr addrspace(1) %p.1
store i32 %c, ptr addrspace(1) %p.2
store i32 %d, ptr addrspace(1) %p.3
ret void
}
; Test for llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn whose <8 x i32> data operand
; is taken from the result of a preceding llvm.amdgcn.image.bvh8.intersect.ray
; call. Unlike the pop1 form, the first element of the returned pair is an i64,
; which the expected output holds in a 64-bit register pair and writes with a
; b64 store. A conditional block sits between the intersect-ray call and the
; stack operation, and a second intersect-ray call follows it. The autogenerated
; assertions below expect s_wait_bvhcnt 0x0 directly before the ds_bvh_stack
; instruction, s_wait_dscnt 0x0 before the stack results are stored, and
; s_wait_bvhcnt 0x0 again before the second intersect-ray result is stored.
define amdgpu_gs void @test_ds_bvh_stack_push8_pop2(i32 %addr, i32 %data.0, i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, i32 %offset, <4 x i32> inreg %tdescr, ptr addrspace(1) %p.0, ptr addrspace(1) %p.1, ptr addrspace(1) %p.2, ptr addrspace(1) %p.3) {
; CHECK-LABEL: test_ds_bvh_stack_push8_pop2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v21, v6
; CHECK-NEXT: v_dual_mov_b32 v20, v5 :: v_dual_mov_b32 v5, 0
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v32, v21
; CHECK-NEXT: v_mov_b32_e32 v31, v20
; CHECK-NEXT: image_bvh8_intersect_ray v[21:30], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3]
; CHECK-NEXT: v_cmpx_eq_f32_e32 0, v20
; CHECK-NEXT: s_cbranch_execz .LBB2_2
; CHECK-NEXT: ; %bb.1: ; %if
; CHECK-NEXT: global_load_b64 v[6:7], v[12:13], off
; CHECK-NEXT: global_load_b64 v[34:35], v[14:15], off
; CHECK-NEXT: global_load_b64 v[36:37], v[16:17], off
; CHECK-NEXT: global_load_b64 v[38:39], v[18:19], off
; CHECK-NEXT: s_wait_loadcnt 0x2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v7, v35
; CHECK-NEXT: s_wait_loadcnt 0x1
; CHECK-NEXT: v_add3_u32 v6, v6, v34, v36
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT: v_add3_u32 v1, v1, v37, v39
; CHECK-NEXT: v_add3_u32 v1, v6, v38, v1
; CHECK-NEXT: .LBB2_2: ; %end
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_wait_bvhcnt 0x0
; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[6:7], v0, v1, v[21:28]
; CHECK-NEXT: image_bvh8_intersect_ray v[20:29], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3]
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: global_store_b64 v[12:13], v[6:7], off
; CHECK-NEXT: global_store_b32 v[14:15], v0, off
; CHECK-NEXT: s_wait_bvhcnt 0x0
; CHECK-NEXT: global_store_b32 v[16:17], v20, off
; CHECK-NEXT: global_store_b32 v[18:19], v21, off
; CHECK-NEXT: s_endpgm
entry:
; Assemble the scalar origin and direction arguments into <3 x float> vectors.
%ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
%ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0
%ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
%ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
; First intersect-ray call; element 0 of the returned struct is a <10 x i32>.
%v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, i32 %offset, <4 x i32> %tdescr)
%a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0
; Repack elements 0-7 of that vector as the <8 x i32> operand of the stack op.
%val.0 = extractelement <10 x i32> %a, i32 0
%val.1 = extractelement <10 x i32> %a, i32 1
%val.2 = extractelement <10 x i32> %a, i32 2
%val.3 = extractelement <10 x i32> %a, i32 3
%val.4 = extractelement <10 x i32> %a, i32 4
%val.5 = extractelement <10 x i32> %a, i32 5
%val.6 = extractelement <10 x i32> %a, i32 6
%val.7 = extractelement <10 x i32> %a, i32 7
%bvh.0 = insertelement <8 x i32> poison, i32 %val.0, i32 0
%bvh.1 = insertelement <8 x i32> %bvh.0, i32 %val.1, i32 1
%bvh.2 = insertelement <8 x i32> %bvh.1, i32 %val.2, i32 2
%bvh.3 = insertelement <8 x i32> %bvh.2, i32 %val.3, i32 3
%bvh.4 = insertelement <8 x i32> %bvh.3, i32 %val.4, i32 4
%bvh.5 = insertelement <8 x i32> %bvh.4, i32 %val.5, i32 5
%bvh.6 = insertelement <8 x i32> %bvh.5, i32 %val.6, i32 6
%bvh = insertelement <8 x i32> %bvh.6, i32 %val.7, i32 7
; Enter the %if block only when ray_origin_x compares ordered-equal to 0.0.
%cnd = fcmp oeq float %ray_origin_x, 0.0
br i1 %cnd, label %if, label %end
if:
; loads to force vgpr pressure
%load.0 = load <2 x i32>, ptr addrspace(1) %p.0
%load.1 = load <2 x i32>, ptr addrspace(1) %p.1
%load.2 = load <2 x i32>, ptr addrspace(1) %p.2
%load.3 = load <2 x i32>, ptr addrspace(1) %p.3
; Sum the four loaded vectors lane-wise, then add the two lanes together to
; produce the alternative data value for the stack operation.
%add.0 = add <2 x i32> %load.0, %load.1
%add.1 = add <2 x i32> %add.0, %load.2
%add.2 = add <2 x i32> %add.1, %load.3
%.i0 = extractelement <2 x i32> %add.2, i32 0
%.i1 = extractelement <2 x i32> %add.2, i32 1
%data.1 = add i32 %.i0, %.i1
br label %end
end:
; %data is the incoming %data.0 argument, or the sum computed in %if.
%data = phi i32 [ %data.0, %entry ], [ %data.1, %if ]
; Stack operation under test; it consumes %bvh from the first intersect-ray call.
%pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data, <8 x i32> %bvh, i32 0)
; Element 0 (i64) and element 1 (i32) of the pair are stored to %p.0 and %p.1 below.
%vdst = extractvalue { i64, i32 } %pair, 0
%newaddr = extractvalue { i64, i32 } %pair, 1
; keep all intersect ray parameters live
; The second call reuses %node_ptr, %ray_extent, %offset and %tdescr, and takes
; its origin and direction from elements 1 and 2 of the first call's result.
%new.origin = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1
%new.dir = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2
%v.2 = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %new.origin, <3 x float> %new.dir, i32 %offset, <4 x i32> %tdescr)
%b = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v.2, 0
; Only the first two elements of the second result are used (stored below).
%c = extractelement <10 x i32> %b, i32 0
%d = extractelement <10 x i32> %b, i32 1
; stores keep pointers live
store i64 %vdst, ptr addrspace(1) %p.0
store i32 %newaddr, ptr addrspace(1) %p.1
store i32 %c, ptr addrspace(1) %p.2
store i32 %d, ptr addrspace(1) %p.3
ret void
}