; llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll - llvm-project - Git at Google

 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

 ; The bitcast should be pushed through the bitcasts so the vectors can
 ; be broken down and the shared components can be CSEd

 ; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v8f32:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; Kernel: performs two volatile stores of an <8 x float> value to %out. Each
 ; value is a bitcast of a constant <8 x i32> vector; the two constants agree
 ; in their first seven elements (all 7) and differ only in the last element
 ; (8 versus 9). The %vec argument is not used by the body.
 define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(ptr addrspace(1) %out, <8 x i32> %vec) {
   ; First constant: the last element is 8.
   %cast.tail8 = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
   store volatile <8 x float> %cast.tail8, ptr addrspace(1) %out

   ; Second constant: the last element is 9; stored to the same address.
   %cast.tail9 = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 9> to <8 x float>
   store volatile <8 x float> %cast.tail9, ptr addrspace(1) %out
   ret void
 }

 ; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v8f32:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; Kernel: performs two volatile stores of an <8 x float> value to %out. Each
 ; value is a bitcast of a constant <4 x i64> vector; the two constants share
 ; their first three elements (all 7) and differ only in the last element
 ; (8 versus 9). The %vec argument is not used by the body.
 define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(ptr addrspace(1) %out, <4 x i64> %vec) {
   ; First constant: the last i64 element is 8.
   %cast.tail8 = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
   store volatile <8 x float> %cast.tail8, ptr addrspace(1) %out

   ; Second constant: the last i64 element is 9; stored to the same address.
   %cast.tail9 = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <8 x float>
   store volatile <8 x float> %cast.tail9, ptr addrspace(1) %out
   ret void
 }

 ; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v4f64:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; Kernel: performs two volatile stores of a <4 x double> value to %out. Each
 ; value is a bitcast of a constant <4 x i64> vector, so the element width is
 ; unchanged by the cast. The two constants share their first three elements
 ; (all 7) and differ only in the last element (8 versus 9). The %vec argument
 ; is not used by the body.
 define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %vec) {
   ; First constant: the last i64 element is 8.
   %cast.tail8 = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
   store volatile <4 x double> %cast.tail8, ptr addrspace(1) %out

   ; Second constant: the last i64 element is 9; stored to the same address.
   %cast.tail9 = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <4 x double>
   store volatile <4 x double> %cast.tail9, ptr addrspace(1) %out
   ret void
 }

 ; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v16i16:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; Kernel: performs two volatile stores of an <8 x float> value to %out. Each
 ; value is a bitcast of a constant <16 x i16> vector; the two constants share
 ; their first fifteen elements (all 7) and differ only in the last element
 ; (8 versus 9). The %vec argument is not used by the body.
 ;
 ; NOTE(review): the function name says "v8i32_to_v16i16", but the cast in the
 ; body goes from <16 x i16> to <8 x float>. The name is kept as-is because the
 ; label check for this function matches on it.
 define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(ptr addrspace(1) %out, <16 x i16> %vec) {
   ; First constant: the last i16 element is 8.
   %cast.tail8 = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
   store volatile <8 x float> %cast.tail8, ptr addrspace(1) %out

   ; Second constant: the last i16 element is 9; stored to the same address.
   %cast.tail9 = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 9> to <8 x float>
   store volatile <8 x float> %cast.tail9, ptr addrspace(1) %out
   ret void
 }

 ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source:
 ; GCN-NOT: store_dword
 ; Kernel: calls the 64-bit compare intrinsic on %a and %b with 999 as its
 ; third operand, reinterprets the i64 result as <2 x i32>, and stores that
 ; vector to %out (non-volatile store).
 ;
 ; NOTE(review): the function name indicates the call result is expected to be
 ; lowered to undef - presumably because 999 is not a valid comparison code for
 ; the intrinsic; confirm against the intrinsic's documentation.
 define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %bad.cmp = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 999) #1
   ; View the 64-bit result as two 32-bit lanes.
   %lanes = bitcast i64 %bad.cmp to <2 x i32>
   store <2 x i32> %lanes, ptr addrspace(1) %out
   ret void
 }

 ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt:
 ; GCN-NOT: store_dword
 ; Kernel: calls the 64-bit compare intrinsic on %a and %b with 9999 as its
 ; third operand, reinterprets the i64 result as <2 x i32>, extracts element 1,
 ; and stores that single i32 to %out (non-volatile store).
 ;
 ; NOTE(review): the function name indicates the call result is expected to be
 ; lowered to undef - presumably because 9999 is not a valid comparison code
 ; for the intrinsic; confirm against the intrinsic's documentation.
 define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %bad.cmp = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 9999) #1
   ; View the 64-bit result as two 32-bit lanes, then pick lane 1.
   %lanes = bitcast i64 %bad.cmp to <2 x i32>
   %lane1 = extractelement <2 x i32> %lanes, i32 1
   store i32 %lane1, ptr addrspace(1) %out
   ret void
 }

 ; Declaration of the 64-bit compare intrinsic called by the two
 ; store_value_lowered_to_undef_* kernels: takes two i64 operands plus an i32
 ; operand and returns an i64.
 declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #1

 ; Attribute group #0 is attached to the two store_value_lowered_to_undef_*
 ; kernel definitions.
 attributes #0 = { nounwind }
 ; Attribute group #1 is attached to the intrinsic declaration and to each of
 ; its call sites.
 attributes #1 = { nounwind readnone convergent }
	; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
	; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

	; The bitcast should be pushed through the bitcasts so the vectors can
	; be broken down and the shared components can be CSEd

	; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v8f32:
	; GCN: buffer_store_dwordx4
	; GCN: buffer_store_dwordx4
	; GCN-NOT: v_mov_b32
	; GCN: buffer_store_dwordx4
	; GCN-NOT: v_mov_b32
	; GCN: buffer_store_dwordx4
	; Kernel: issues two volatile <8 x float> stores to %out. Both stored values
	; are bitcasts of constant <8 x i32> vectors whose first seven elements are 7;
	; only the final element differs (8 in the first, 9 in the second). The %vec
	; argument is never referenced.
	define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(ptr addrspace(1) %out, <8 x i32> %vec) {
	  ; Constant ending in 8.
	  %as.float.a = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
	  store volatile <8 x float> %as.float.a, ptr addrspace(1) %out

	  ; Constant ending in 9, written to the same pointer.
	  %as.float.b = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 9> to <8 x float>
	  store volatile <8 x float> %as.float.b, ptr addrspace(1) %out
	  ret void
	}

	; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v8f32:
	; GCN: buffer_store_dwordx4
	; GCN: buffer_store_dwordx4
	; GCN-NOT: v_mov_b32
	; GCN: buffer_store_dwordx4
	; GCN-NOT: v_mov_b32
	; GCN: buffer_store_dwordx4
	; Kernel: issues two volatile <8 x float> stores to %out. Both stored values
	; are bitcasts of constant <4 x i64> vectors whose first three elements are 7;
	; only the final element differs (8 in the first, 9 in the second). The %vec
	; argument is never referenced.
	define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(ptr addrspace(1) %out, <4 x i64> %vec) {
	  ; Constant ending in 8.
	  %as.float.a = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
	  store volatile <8 x float> %as.float.a, ptr addrspace(1) %out

	  ; Constant ending in 9, written to the same pointer.
	  %as.float.b = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <8 x float>
	  store volatile <8 x float> %as.float.b, ptr addrspace(1) %out
	  ret void
	}

	; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v4f64:
	; GCN: buffer_store_dwordx4
	; GCN: buffer_store_dwordx4
	; GCN-NOT: v_mov_b32
	; GCN: buffer_store_dwordx4
	; GCN-NOT: v_mov_b32
	; GCN: buffer_store_dwordx4
	; Kernel: issues two volatile <4 x double> stores to %out. Both stored values
	; are bitcasts of constant <4 x i64> vectors (same element width on both sides
	; of the cast) whose first three elements are 7; only the final element
	; differs (8 in the first, 9 in the second). The %vec argument is never
	; referenced.
	define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %vec) {
	  ; Constant ending in 8.
	  %as.double.a = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
	  store volatile <4 x double> %as.double.a, ptr addrspace(1) %out

	  ; Constant ending in 9, written to the same pointer.
	  %as.double.b = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <4 x double>
	  store volatile <4 x double> %as.double.b, ptr addrspace(1) %out
	  ret void
	}

	; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v16i16:
	; GCN: buffer_store_dwordx4
	; GCN: buffer_store_dwordx4
	; GCN-NOT: v_mov_b32
	; GCN: buffer_store_dwordx4
	; GCN-NOT: v_mov_b32
	; GCN: buffer_store_dwordx4
	; Kernel: issues two volatile <8 x float> stores to %out. Both stored values
	; are bitcasts of constant <16 x i16> vectors whose first fifteen elements are
	; 7; only the final element differs (8 in the first, 9 in the second). The
	; %vec argument is never referenced.
	;
	; NOTE(review): the "v8i32_to_v16i16" in the function name does not match the
	; body, which casts <16 x i16> to <8 x float>. The name is left unchanged
	; because the label check for this function matches on it.
	define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(ptr addrspace(1) %out, <16 x i16> %vec) {
	  ; Constant ending in 8.
	  %as.float.a = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
	  store volatile <8 x float> %as.float.a, ptr addrspace(1) %out

	  ; Constant ending in 9, written to the same pointer.
	  %as.float.b = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 9> to <8 x float>
	  store volatile <8 x float> %as.float.b, ptr addrspace(1) %out
	  ret void
	}

	; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source:
	; GCN-NOT: store_dword
	; Kernel: feeds %a and %b to the 64-bit compare intrinsic with 999 as the
	; third operand, bitcasts the i64 result to <2 x i32>, and writes that vector
	; to %out with an ordinary (non-volatile) store.
	;
	; NOTE(review): per the function name, the intrinsic result is expected to be
	; lowered to undef - presumably 999 is outside the set of valid comparison
	; codes; verify against the intrinsic's documentation.
	define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
	  %cmp.result = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 999) #1
	  ; Reinterpret the 64-bit value as a pair of 32-bit elements.
	  %pair = bitcast i64 %cmp.result to <2 x i32>
	  store <2 x i32> %pair, ptr addrspace(1) %out
	  ret void
	}

	; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt:
	; GCN-NOT: store_dword
	; Kernel: feeds %a and %b to the 64-bit compare intrinsic with 9999 as the
	; third operand, bitcasts the i64 result to <2 x i32>, takes element 1 of that
	; vector, and writes the resulting i32 to %out with an ordinary
	; (non-volatile) store.
	;
	; NOTE(review): per the function name, the intrinsic result is expected to be
	; lowered to undef - presumably 9999 is outside the set of valid comparison
	; codes; verify against the intrinsic's documentation.
	define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
	  %cmp.result = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 9999) #1
	  ; Reinterpret the 64-bit value as a pair of 32-bit elements; keep index 1.
	  %pair = bitcast i64 %cmp.result to <2 x i32>
	  %second = extractelement <2 x i32> %pair, i32 1
	  store i32 %second, ptr addrspace(1) %out
	  ret void
	}

	; Declaration of the 64-bit compare intrinsic called by the two
	; store_value_lowered_to_undef_* kernels: takes two i64 operands plus an i32
	; operand and returns an i64.
	declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #1

	; Attribute group #0 is attached to the two store_value_lowered_to_undef_*
	; kernel definitions.
	attributes #0 = { nounwind }
	; Attribute group #1 is attached to the intrinsic declaration and to each of
	; its call sites.
	attributes #1 = { nounwind readnone convergent }