; test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll - llvm-project/llvm - Git at Google

 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)

 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
 declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)

 ; GCN-LABEL: {{^}}image_bvh_intersect_ray:
 ; GCN: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]{{$}}
 ; Arguments are flattened to represent the actual VGPR_A layout, so we have no
 ; extra moves in the generated kernel.
 ; Packs the nine scalar ray components into three <4 x float> vectors (lane 3
 ; of each is left undef), calls the i32-node / v4f32 intersect-ray intrinsic
 ; and returns its <4 x i32> result reinterpreted as <4 x float>.
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
 main_body:
   ; Ray origin: lanes 0..2 <- x, y, z.
   %origin.x = insertelement <4 x float> undef, float %ray_origin_x, i32 0
   %origin.xy = insertelement <4 x float> %origin.x, float %ray_origin_y, i32 1
   %origin.xyz = insertelement <4 x float> %origin.xy, float %ray_origin_z, i32 2
   ; Ray direction: lanes 0..2 <- x, y, z.
   %dir.x = insertelement <4 x float> undef, float %ray_dir_x, i32 0
   %dir.xy = insertelement <4 x float> %dir.x, float %ray_dir_y, i32 1
   %dir.xyz = insertelement <4 x float> %dir.xy, float %ray_dir_z, i32 2
   ; Inverse ray direction: lanes 0..2 <- x, y, z.
   %inv_dir.x = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
   %inv_dir.xy = insertelement <4 x float> %inv_dir.x, float %ray_inv_dir_y, i32 1
   %inv_dir.xyz = insertelement <4 x float> %inv_dir.xy, float %ray_inv_dir_z, i32 2
   %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %origin.xyz, <4 x float> %dir.xyz, <4 x float> %inv_dir.xyz, <4 x i32> %tdescr)
   ; The shader returns floats, so reinterpret the integer result bits.
   %hit.f32 = bitcast <4 x i32> %hit to <4 x float>
   ret <4 x float> %hit.f32
 }

 ; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16:
 ; GCN: image_bvh_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
 ; Half-precision direction variant: every argument is marked inreg and is
 ; forwarded unchanged to the i32-node / v4f16 intersect-ray intrinsic. The
 ; <4 x i32> result is returned reinterpreted as <4 x float>.
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
 main_body:
   %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
   ; The shader returns floats, so reinterpret the integer result bits.
   %hit.f32 = bitcast <4 x i32> %hit to <4 x float>
   ret <4 x float> %hit.f32
 }

 ; GCN-LABEL: {{^}}image_bvh64_intersect_ray:
 ; GCN:  image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]{{$}}
 ; Arguments are flattened to represent the actual VGPR_A layout, so we have no
 ; extra moves in the generated kernel.
 ; 64-bit node pointer variant: the pointer arrives as <2 x i32> and is bitcast
 ; to i64. The nine scalar ray components are packed into three <4 x float>
 ; vectors (lane 3 of each is left undef) for the i64-node / v4f32 intrinsic,
 ; whose <4 x i32> result is returned reinterpreted as <4 x float>.
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
 main_body:
   ; Reassemble the two 32-bit halves into the 64-bit node pointer.
   %node64 = bitcast <2 x i32> %node_ptr_vec to i64
   ; Ray origin: lanes 0..2 <- x, y, z.
   %origin.x = insertelement <4 x float> undef, float %ray_origin_x, i32 0
   %origin.xy = insertelement <4 x float> %origin.x, float %ray_origin_y, i32 1
   %origin.xyz = insertelement <4 x float> %origin.xy, float %ray_origin_z, i32 2
   ; Ray direction: lanes 0..2 <- x, y, z.
   %dir.x = insertelement <4 x float> undef, float %ray_dir_x, i32 0
   %dir.xy = insertelement <4 x float> %dir.x, float %ray_dir_y, i32 1
   %dir.xyz = insertelement <4 x float> %dir.xy, float %ray_dir_z, i32 2
   ; Inverse ray direction: lanes 0..2 <- x, y, z.
   %inv_dir.x = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
   %inv_dir.xy = insertelement <4 x float> %inv_dir.x, float %ray_inv_dir_y, i32 1
   %inv_dir.xyz = insertelement <4 x float> %inv_dir.xy, float %ray_inv_dir_z, i32 2
   %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node64, float %ray_extent, <4 x float> %origin.xyz, <4 x float> %dir.xyz, <4 x float> %inv_dir.xyz, <4 x i32> %tdescr)
   ; The shader returns floats, so reinterpret the integer result bits.
   %hit.f32 = bitcast <4 x i32> %hit to <4 x float>
   ret <4 x float> %hit.f32
 }

 ; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16:
 ; GCN: image_bvh64_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
 ; 64-bit node pointer, half-precision direction variant: every argument is
 ; marked inreg and is forwarded unchanged to the i64-node / v4f16 intersect-ray
 ; intrinsic. The <4 x i32> result is returned reinterpreted as <4 x float>.
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
 main_body:
   %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
   ; The shader returns floats, so reinterpret the integer result bits.
   %hit.f32 = bitcast <4 x i32> %hit to <4 x float>
   ret <4 x float> %hit.f32
 }

 ; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs.

 ; GCN-LABEL: {{^}}image_bvh_intersect_ray_nsa_reassign:
 ; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
 ; Kernel variant: the node pointer and ray extent are loaded from %p_node_ptr
 ; and %p_ray at the index given by workitem id x, while origin, direction and
 ; inverse direction are built from the constants 0.0 .. 8.0 (lane 3 of each
 ; vector is left undef). The intrinsic result is stored through an undef
 ; pointer.
 define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
 main_body:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   ; Per-workitem node pointer.
   %node.addr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %tid
   %node = load i32, i32* %node.addr, align 4
   ; Per-workitem ray extent.
   %extent.addr = getelementptr inbounds float, float* %p_ray, i32 %tid
   %extent = load float, float* %extent.addr, align 4
   ; Constant origin (0, 1, 2).
   %origin.x = insertelement <4 x float> undef, float 0.0, i32 0
   %origin.xy = insertelement <4 x float> %origin.x, float 1.0, i32 1
   %origin.xyz = insertelement <4 x float> %origin.xy, float 2.0, i32 2
   ; Constant direction (3, 4, 5).
   %dir.x = insertelement <4 x float> undef, float 3.0, i32 0
   %dir.xy = insertelement <4 x float> %dir.x, float 4.0, i32 1
   %dir.xyz = insertelement <4 x float> %dir.xy, float 5.0, i32 2
   ; Constant inverse direction (6, 7, 8).
   %inv_dir.x = insertelement <4 x float> undef, float 6.0, i32 0
   %inv_dir.xy = insertelement <4 x float> %inv_dir.x, float 7.0, i32 1
   %inv_dir.xyz = insertelement <4 x float> %inv_dir.xy, float 8.0, i32 2
   %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node, float %extent, <4 x float> %origin.xyz, <4 x float> %dir.xyz, <4 x float> %inv_dir.xyz, <4 x i32> %tdescr)
   ; NOTE(review): the undef destination presumably exists only to give the
   ; result a use - confirm.
   store <4 x i32> %hit, <4 x i32>* undef
   ret void
 }

 ; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16_nsa_reassign:
 ; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
 ; Kernel variant with half-precision directions: the node pointer and ray
 ; extent are loaded from %p_node_ptr and %p_ray at the index given by workitem
 ; id x. The origin is a <4 x float> built from 0.0 .. 2.0; direction and
 ; inverse direction are <4 x half> built from 3.0 .. 8.0 (lane 3 of each vector
 ; is left undef). The intrinsic result is stored through an undef pointer.
 define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
 main_body:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   ; Per-workitem node pointer.
   %node.addr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %tid
   %node = load i32, i32* %node.addr, align 4
   ; Per-workitem ray extent.
   %extent.addr = getelementptr inbounds float, float* %p_ray, i32 %tid
   %extent = load float, float* %extent.addr, align 4
   ; Constant origin (0, 1, 2), full precision.
   %origin.x = insertelement <4 x float> undef, float 0.0, i32 0
   %origin.xy = insertelement <4 x float> %origin.x, float 1.0, i32 1
   %origin.xyz = insertelement <4 x float> %origin.xy, float 2.0, i32 2
   ; Constant direction (3, 4, 5), half precision.
   %dir.x = insertelement <4 x half> undef, half 3.0, i32 0
   %dir.xy = insertelement <4 x half> %dir.x, half 4.0, i32 1
   %dir.xyz = insertelement <4 x half> %dir.xy, half 5.0, i32 2
   ; Constant inverse direction (6, 7, 8), half precision.
   %inv_dir.x = insertelement <4 x half> undef, half 6.0, i32 0
   %inv_dir.xy = insertelement <4 x half> %inv_dir.x, half 7.0, i32 1
   %inv_dir.xyz = insertelement <4 x half> %inv_dir.xy, half 8.0, i32 2
   %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node, float %extent, <4 x float> %origin.xyz, <4 x half> %dir.xyz, <4 x half> %inv_dir.xyz, <4 x i32> %tdescr)
   ; NOTE(review): the undef destination presumably exists only to give the
   ; result a use - confirm.
   store <4 x i32> %hit, <4 x i32>* undef
   ret void
 }

 ; GCN-LABEL: {{^}}image_bvh64_intersect_ray_nsa_reassign:
 ; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
 ; Kernel variant with a constant 64-bit node pointer (1111111111111): only the
 ; ray extent is loaded, from %p_ray at the index given by workitem id x.
 ; Origin, direction and inverse direction are built from the constants
 ; 0.0 .. 8.0 (lane 3 of each vector is left undef). The intrinsic result is
 ; stored through an undef pointer.
 define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
 main_body:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   ; Per-workitem ray extent.
   %extent.addr = getelementptr inbounds float, float* %p_ray, i32 %tid
   %extent = load float, float* %extent.addr, align 4
   ; Constant origin (0, 1, 2).
   %origin.x = insertelement <4 x float> undef, float 0.0, i32 0
   %origin.xy = insertelement <4 x float> %origin.x, float 1.0, i32 1
   %origin.xyz = insertelement <4 x float> %origin.xy, float 2.0, i32 2
   ; Constant direction (3, 4, 5).
   %dir.x = insertelement <4 x float> undef, float 3.0, i32 0
   %dir.xy = insertelement <4 x float> %dir.x, float 4.0, i32 1
   %dir.xyz = insertelement <4 x float> %dir.xy, float 5.0, i32 2
   ; Constant inverse direction (6, 7, 8).
   %inv_dir.x = insertelement <4 x float> undef, float 6.0, i32 0
   %inv_dir.xy = insertelement <4 x float> %inv_dir.x, float 7.0, i32 1
   %inv_dir.xyz = insertelement <4 x float> %inv_dir.xy, float 8.0, i32 2
   %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %extent, <4 x float> %origin.xyz, <4 x float> %dir.xyz, <4 x float> %inv_dir.xyz, <4 x i32> %tdescr)
   ; NOTE(review): the undef destination presumably exists only to give the
   ; result a use - confirm.
   store <4 x i32> %hit, <4 x i32>* undef
   ret void
 }

 ; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16_nsa_reassign:
 ; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
 ; Kernel variant with a constant 64-bit node pointer (1111111111110) and
 ; half-precision directions: only the ray extent is loaded, from %p_ray at the
 ; index given by workitem id x. The origin is a <4 x float> built from
 ; 0.0 .. 2.0; direction and inverse direction are <4 x half> built from
 ; 3.0 .. 8.0 (lane 3 of each vector is left undef). The intrinsic result is
 ; stored through an undef pointer.
 define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
 main_body:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   ; Per-workitem ray extent.
   %extent.addr = getelementptr inbounds float, float* %p_ray, i32 %tid
   %extent = load float, float* %extent.addr, align 4
   ; Constant origin (0, 1, 2), full precision.
   %origin.x = insertelement <4 x float> undef, float 0.0, i32 0
   %origin.xy = insertelement <4 x float> %origin.x, float 1.0, i32 1
   %origin.xyz = insertelement <4 x float> %origin.xy, float 2.0, i32 2
   ; Constant direction (3, 4, 5), half precision.
   %dir.x = insertelement <4 x half> undef, half 3.0, i32 0
   %dir.xy = insertelement <4 x half> %dir.x, half 4.0, i32 1
   %dir.xyz = insertelement <4 x half> %dir.xy, half 5.0, i32 2
   ; Constant inverse direction (6, 7, 8), half precision.
   %inv_dir.x = insertelement <4 x half> undef, half 6.0, i32 0
   %inv_dir.xy = insertelement <4 x half> %inv_dir.x, half 7.0, i32 1
   %inv_dir.xyz = insertelement <4 x half> %inv_dir.xy, half 8.0, i32 2
   %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %extent, <4 x float> %origin.xyz, <4 x half> %dir.xyz, <4 x half> %inv_dir.xyz, <4 x i32> %tdescr)
   ; NOTE(review): the undef destination presumably exists only to give the
   ; result a use - confirm.
   store <4 x i32> %hit, <4 x i32>* undef
   ret void
 }

 declare i32 @llvm.amdgcn.workitem.id.x()
	; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

	; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
	; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
	; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
	; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)

	declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
	declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
	declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
	declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)

	; GCN-LABEL: {{^}}image_bvh_intersect_ray:
	; GCN: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]{{$}}
	; Arguments are flattened to represent the actual VGPR_A layout, so we have no
	; extra moves in the generated kernel.
	; Packs the nine scalar ray components into three <4 x float> vectors (lane 3
	; of each is left undef), calls the i32-node / v4f32 intersect-ray intrinsic
	; and returns its <4 x i32> result reinterpreted as <4 x float>.
	define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
	main_body:
	  ; Ray origin: lanes 0..2 <- x, y, z.
	  %origin.x = insertelement <4 x float> undef, float %ray_origin_x, i32 0
	  %origin.xy = insertelement <4 x float> %origin.x, float %ray_origin_y, i32 1
	  %origin.xyz = insertelement <4 x float> %origin.xy, float %ray_origin_z, i32 2
	  ; Ray direction: lanes 0..2 <- x, y, z.
	  %dir.x = insertelement <4 x float> undef, float %ray_dir_x, i32 0
	  %dir.xy = insertelement <4 x float> %dir.x, float %ray_dir_y, i32 1
	  %dir.xyz = insertelement <4 x float> %dir.xy, float %ray_dir_z, i32 2
	  ; Inverse ray direction: lanes 0..2 <- x, y, z.
	  %inv_dir.x = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
	  %inv_dir.xy = insertelement <4 x float> %inv_dir.x, float %ray_inv_dir_y, i32 1
	  %inv_dir.xyz = insertelement <4 x float> %inv_dir.xy, float %ray_inv_dir_z, i32 2
	  %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %origin.xyz, <4 x float> %dir.xyz, <4 x float> %inv_dir.xyz, <4 x i32> %tdescr)
	  ; The shader returns floats, so reinterpret the integer result bits.
	  %hit.f32 = bitcast <4 x i32> %hit to <4 x float>
	  ret <4 x float> %hit.f32
	}

	; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16:
	; GCN: image_bvh_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
	; Half-precision direction variant: every argument is marked inreg and is
	; forwarded unchanged to the i32-node / v4f16 intersect-ray intrinsic. The
	; <4 x i32> result is returned reinterpreted as <4 x float>.
	define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
	main_body:
	  %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
	  ; The shader returns floats, so reinterpret the integer result bits.
	  %hit.f32 = bitcast <4 x i32> %hit to <4 x float>
	  ret <4 x float> %hit.f32
	}

	; GCN-LABEL: {{^}}image_bvh64_intersect_ray:
	; GCN: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]{{$}}
	; Arguments are flattened to represent the actual VGPR_A layout, so we have no
	; extra moves in the generated kernel.
	; 64-bit node pointer variant: the pointer arrives as <2 x i32> and is bitcast
	; to i64. The nine scalar ray components are packed into three <4 x float>
	; vectors (lane 3 of each is left undef) for the i64-node / v4f32 intrinsic,
	; whose <4 x i32> result is returned reinterpreted as <4 x float>.
	define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
	main_body:
	  ; Reassemble the two 32-bit halves into the 64-bit node pointer.
	  %node64 = bitcast <2 x i32> %node_ptr_vec to i64
	  ; Ray origin: lanes 0..2 <- x, y, z.
	  %origin.x = insertelement <4 x float> undef, float %ray_origin_x, i32 0
	  %origin.xy = insertelement <4 x float> %origin.x, float %ray_origin_y, i32 1
	  %origin.xyz = insertelement <4 x float> %origin.xy, float %ray_origin_z, i32 2
	  ; Ray direction: lanes 0..2 <- x, y, z.
	  %dir.x = insertelement <4 x float> undef, float %ray_dir_x, i32 0
	  %dir.xy = insertelement <4 x float> %dir.x, float %ray_dir_y, i32 1
	  %dir.xyz = insertelement <4 x float> %dir.xy, float %ray_dir_z, i32 2
	  ; Inverse ray direction: lanes 0..2 <- x, y, z.
	  %inv_dir.x = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
	  %inv_dir.xy = insertelement <4 x float> %inv_dir.x, float %ray_inv_dir_y, i32 1
	  %inv_dir.xyz = insertelement <4 x float> %inv_dir.xy, float %ray_inv_dir_z, i32 2
	  %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node64, float %ray_extent, <4 x float> %origin.xyz, <4 x float> %dir.xyz, <4 x float> %inv_dir.xyz, <4 x i32> %tdescr)
	  ; The shader returns floats, so reinterpret the integer result bits.
	  %hit.f32 = bitcast <4 x i32> %hit to <4 x float>
	  ret <4 x float> %hit.f32
	}

	; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16:
	; GCN: image_bvh64_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
	; 64-bit node pointer, half-precision direction variant: every argument is
	; marked inreg and is forwarded unchanged to the i64-node / v4f16 intersect-ray
	; intrinsic. The <4 x i32> result is returned reinterpreted as <4 x float>.
	define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
	main_body:
	  %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
	  ; The shader returns floats, so reinterpret the integer result bits.
	  %hit.f32 = bitcast <4 x i32> %hit to <4 x float>
	  ret <4 x float> %hit.f32
	}

	; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs.

	; GCN-LABEL: {{^}}image_bvh_intersect_ray_nsa_reassign:
	; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
	; Kernel variant: the node pointer and ray extent are loaded from %p_node_ptr
	; and %p_ray at the index given by workitem id x, while origin, direction and
	; inverse direction are built from the constants 0.0 .. 8.0 (lane 3 of each
	; vector is left undef). The intrinsic result is stored through an undef
	; pointer.
	define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
	main_body:
	  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
	  ; Per-workitem node pointer.
	  %node.addr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %tid
	  %node = load i32, i32* %node.addr, align 4
	  ; Per-workitem ray extent.
	  %extent.addr = getelementptr inbounds float, float* %p_ray, i32 %tid
	  %extent = load float, float* %extent.addr, align 4
	  ; Constant origin (0, 1, 2).
	  %origin.x = insertelement <4 x float> undef, float 0.0, i32 0
	  %origin.xy = insertelement <4 x float> %origin.x, float 1.0, i32 1
	  %origin.xyz = insertelement <4 x float> %origin.xy, float 2.0, i32 2
	  ; Constant direction (3, 4, 5).
	  %dir.x = insertelement <4 x float> undef, float 3.0, i32 0
	  %dir.xy = insertelement <4 x float> %dir.x, float 4.0, i32 1
	  %dir.xyz = insertelement <4 x float> %dir.xy, float 5.0, i32 2
	  ; Constant inverse direction (6, 7, 8).
	  %inv_dir.x = insertelement <4 x float> undef, float 6.0, i32 0
	  %inv_dir.xy = insertelement <4 x float> %inv_dir.x, float 7.0, i32 1
	  %inv_dir.xyz = insertelement <4 x float> %inv_dir.xy, float 8.0, i32 2
	  %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node, float %extent, <4 x float> %origin.xyz, <4 x float> %dir.xyz, <4 x float> %inv_dir.xyz, <4 x i32> %tdescr)
	  ; NOTE(review): the undef destination presumably exists only to give the
	  ; result a use - confirm.
	  store <4 x i32> %hit, <4 x i32>* undef
	  ret void
	}

	; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16_nsa_reassign:
	; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
	; Kernel variant with half-precision directions: the node pointer and ray
	; extent are loaded from %p_node_ptr and %p_ray at the index given by workitem
	; id x. The origin is a <4 x float> built from 0.0 .. 2.0; direction and
	; inverse direction are <4 x half> built from 3.0 .. 8.0 (lane 3 of each vector
	; is left undef). The intrinsic result is stored through an undef pointer.
	define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
	main_body:
	  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
	  ; Per-workitem node pointer.
	  %node.addr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %tid
	  %node = load i32, i32* %node.addr, align 4
	  ; Per-workitem ray extent.
	  %extent.addr = getelementptr inbounds float, float* %p_ray, i32 %tid
	  %extent = load float, float* %extent.addr, align 4
	  ; Constant origin (0, 1, 2), full precision.
	  %origin.x = insertelement <4 x float> undef, float 0.0, i32 0
	  %origin.xy = insertelement <4 x float> %origin.x, float 1.0, i32 1
	  %origin.xyz = insertelement <4 x float> %origin.xy, float 2.0, i32 2
	  ; Constant direction (3, 4, 5), half precision.
	  %dir.x = insertelement <4 x half> undef, half 3.0, i32 0
	  %dir.xy = insertelement <4 x half> %dir.x, half 4.0, i32 1
	  %dir.xyz = insertelement <4 x half> %dir.xy, half 5.0, i32 2
	  ; Constant inverse direction (6, 7, 8), half precision.
	  %inv_dir.x = insertelement <4 x half> undef, half 6.0, i32 0
	  %inv_dir.xy = insertelement <4 x half> %inv_dir.x, half 7.0, i32 1
	  %inv_dir.xyz = insertelement <4 x half> %inv_dir.xy, half 8.0, i32 2
	  %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node, float %extent, <4 x float> %origin.xyz, <4 x half> %dir.xyz, <4 x half> %inv_dir.xyz, <4 x i32> %tdescr)
	  ; NOTE(review): the undef destination presumably exists only to give the
	  ; result a use - confirm.
	  store <4 x i32> %hit, <4 x i32>* undef
	  ret void
	}

	; GCN-LABEL: {{^}}image_bvh64_intersect_ray_nsa_reassign:
	; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
	; Kernel variant with a constant 64-bit node pointer (1111111111111): only the
	; ray extent is loaded, from %p_ray at the index given by workitem id x.
	; Origin, direction and inverse direction are built from the constants
	; 0.0 .. 8.0 (lane 3 of each vector is left undef). The intrinsic result is
	; stored through an undef pointer.
	define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
	main_body:
	  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
	  ; Per-workitem ray extent.
	  %extent.addr = getelementptr inbounds float, float* %p_ray, i32 %tid
	  %extent = load float, float* %extent.addr, align 4
	  ; Constant origin (0, 1, 2).
	  %origin.x = insertelement <4 x float> undef, float 0.0, i32 0
	  %origin.xy = insertelement <4 x float> %origin.x, float 1.0, i32 1
	  %origin.xyz = insertelement <4 x float> %origin.xy, float 2.0, i32 2
	  ; Constant direction (3, 4, 5).
	  %dir.x = insertelement <4 x float> undef, float 3.0, i32 0
	  %dir.xy = insertelement <4 x float> %dir.x, float 4.0, i32 1
	  %dir.xyz = insertelement <4 x float> %dir.xy, float 5.0, i32 2
	  ; Constant inverse direction (6, 7, 8).
	  %inv_dir.x = insertelement <4 x float> undef, float 6.0, i32 0
	  %inv_dir.xy = insertelement <4 x float> %inv_dir.x, float 7.0, i32 1
	  %inv_dir.xyz = insertelement <4 x float> %inv_dir.xy, float 8.0, i32 2
	  %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %extent, <4 x float> %origin.xyz, <4 x float> %dir.xyz, <4 x float> %inv_dir.xyz, <4 x i32> %tdescr)
	  ; NOTE(review): the undef destination presumably exists only to give the
	  ; result a use - confirm.
	  store <4 x i32> %hit, <4 x i32>* undef
	  ret void
	}

	; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16_nsa_reassign:
	; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
	; Kernel variant with a constant 64-bit node pointer (1111111111110) and
	; half-precision directions: only the ray extent is loaded, from %p_ray at the
	; index given by workitem id x. The origin is a <4 x float> built from
	; 0.0 .. 2.0; direction and inverse direction are <4 x half> built from
	; 3.0 .. 8.0 (lane 3 of each vector is left undef). The intrinsic result is
	; stored through an undef pointer.
	define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
	main_body:
	  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
	  ; Per-workitem ray extent.
	  %extent.addr = getelementptr inbounds float, float* %p_ray, i32 %tid
	  %extent = load float, float* %extent.addr, align 4
	  ; Constant origin (0, 1, 2), full precision.
	  %origin.x = insertelement <4 x float> undef, float 0.0, i32 0
	  %origin.xy = insertelement <4 x float> %origin.x, float 1.0, i32 1
	  %origin.xyz = insertelement <4 x float> %origin.xy, float 2.0, i32 2
	  ; Constant direction (3, 4, 5), half precision.
	  %dir.x = insertelement <4 x half> undef, half 3.0, i32 0
	  %dir.xy = insertelement <4 x half> %dir.x, half 4.0, i32 1
	  %dir.xyz = insertelement <4 x half> %dir.xy, half 5.0, i32 2
	  ; Constant inverse direction (6, 7, 8), half precision.
	  %inv_dir.x = insertelement <4 x half> undef, half 6.0, i32 0
	  %inv_dir.xy = insertelement <4 x half> %inv_dir.x, half 7.0, i32 1
	  %inv_dir.xyz = insertelement <4 x half> %inv_dir.xy, half 8.0, i32 2
	  %hit = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %extent, <4 x float> %origin.xyz, <4 x half> %dir.xyz, <4 x half> %inv_dir.xyz, <4 x i32> %tdescr)
	  ; NOTE(review): the undef destination presumably exists only to give the
	  ; result a use - confirm.
	  store <4 x i32> %hit, <4 x i32>* undef
	  ret void
	}

	declare i32 @llvm.amdgcn.workitem.id.x()