[GVN] Add extra vscale tests with different types. NFC
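These tests exercise store-to-load forwarding between scalable vectors of
differing element types and sizes (i8, i32, i64, float and ptr elements,
including struct-wrapped and mixed fixed/scalable cases). Per the generated
checks below, GVN currently leaves these loads in place. A representative
pattern, taken directly from the tests added here, is:

  store <vscale x 4 x i32> %x, ptr %p
  %load = load <vscale x 16 x i8>, ptr %p
  ret <vscale x 16 x i8> %load

The existing GVN vscale tests are also copied into a new
llvm/test/Transforms/NewGVN/vscale.ll so that NewGVN gets the same coverage.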
diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll
index 71adaed..f6e0f8c 100644
--- a/llvm/test/Transforms/GVN/vscale.ll
+++ b/llvm/test/Transforms/GVN/vscale.ll
@@ -387,3 +387,257 @@ if.else: ret void } + +; Different sizes / types + +define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v16i8_store_v4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 16 x i8> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <vscale x 16 x i8>, ptr %p + ret <vscale x 16 x i8> %load +} + +define <vscale x 4 x float> @load_v4f32_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v4f32_store_v4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x float>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <vscale x 4 x float>, ptr %p + ret <vscale x 4 x float> %load +} + +define <vscale x 4 x float> @load_v4f32_store_v16i8_forward_load(ptr %p, <vscale x 16 x i8> %x) { +; CHECK-LABEL: @load_v4f32_store_v16i8_forward_load( +; CHECK-NEXT: store <vscale x 16 x i8> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x float>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]] +; + store <vscale x 16 x i8> %x, ptr %p + %load = load <vscale x 4 x float>, ptr %p + ret <vscale x 4 x float> %load +} + +define <vscale x 4 x i32> @load_v4i32_store_v4f32_forward_load(ptr %p, <vscale x 4 x float> %x) { +; CHECK-LABEL: @load_v4i32_store_v4f32_forward_load( +; CHECK-NEXT: store <vscale x 4 x float> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 4 x i32> [[LOAD]] +; + store <vscale x 4 x float> %x, ptr %p + %load = load <vscale x 4 x i32>, ptr %p + ret <vscale x 4 x i32> %load +} + +define <vscale x 4 x i32> @load_v4i32_store_v4i64_forward_load(ptr %p, <vscale x 4 x i64> %x) { +; CHECK-LABEL: @load_v4i32_store_v4i64_forward_load( +; CHECK-NEXT: store <vscale x 4 x i64> [[X:%.*]], ptr [[P:%.*]], align 32 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 4 x i32> [[LOAD]] +; + store <vscale x 4 x i64> %x, ptr %p + %load = load <vscale x 4 x i32>, ptr %p + ret <vscale x 4 x i32> %load +} + +define <vscale x 4 x i64> @load_v4i64_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v4i64_store_v4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[P]], align 32 +; CHECK-NEXT: ret <vscale x 4 x i64> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <vscale x 4 x i64>, ptr %p + ret <vscale x 4 x i64> %load +} + +define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[P]], align 8 +; CHECK-NEXT: ret <vscale x 2 x i32> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <vscale x 2 x i32>, ptr %p + ret <vscale x 2 x i32> %load +} + +define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load_offsets(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsets( +; CHECK-NEXT: store 
<vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[Q:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[P]], i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[Q]], align 8 +; CHECK-NEXT: ret <vscale x 2 x i32> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %q = getelementptr <vscale x 2 x i32>, ptr %p, i64 1 + %load = load <vscale x 2 x i32>, ptr %q + ret <vscale x 2 x i32> %load +} + +define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load_offsetc(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsetc( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[Q:%.*]] = getelementptr <2 x i32>, ptr [[P]], i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[Q]], align 8 +; CHECK-NEXT: ret <vscale x 2 x i32> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %q = getelementptr <2 x i32>, ptr %p, i64 1 + %load = load <vscale x 2 x i32>, ptr %q + ret <vscale x 2 x i32> %load +} + +define <vscale x 2 x ptr> @load_v2p0_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v2p0_store_v4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 2 x ptr> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <vscale x 2 x ptr>, ptr %p + ret <vscale x 2 x ptr> %load +} + +define <vscale x 2 x i64> @load_v2i64_store_v2p0_forward_load(ptr %p, <vscale x 2 x ptr> %x) { +; CHECK-LABEL: @load_v2i64_store_v2p0_forward_load( +; CHECK-NEXT: store <vscale x 2 x ptr> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 2 x i64> [[LOAD]] +; + store <vscale x 2 x ptr> %x, ptr %p + %load = load <vscale x 2 x i64>, ptr %p + ret <vscale x 2 x i64> %load +} + +define <vscale x 16 x i8> @load_nxv16i8_store_v4i32_forward_load(ptr %p, <4 x i32> %x) { +; CHECK-LABEL: @load_nxv16i8_store_v4i32_forward_load( +; CHECK-NEXT: store <4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 16 x i8> [[LOAD]] +; + store <4 x i32> %x, ptr %p + %load = load <vscale x 16 x i8>, ptr %p + ret <vscale x 16 x i8> %load +} + +define <16 x i8> @load_v16i8_store_nxv4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v16i8_store_nxv4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <16 x i8> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <16 x i8>, ptr %p + ret <16 x i8> %load +} + +define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_constant(ptr %p) { +; CHECK-LABEL: @load_v16i8_store_v4i32_forward_constant( +; CHECK-NEXT: store <vscale x 4 x i32> splat (i32 4), ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 16 x i8> [[LOAD]] +; + store <vscale x 4 x i32> splat (i32 4), ptr %p + %load = load <vscale x 16 x i8>, ptr %p + ret <vscale x 16 x i8> %load +} + +define <vscale x 16 x i8> @load_v16i8_struct_store_v4i32_forward_load(ptr %p, { <vscale x 4 x i32> } %x) { +; CHECK-LABEL: @load_v16i8_struct_store_v4i32_forward_load( +; CHECK-NEXT: store { <vscale x 4 x i32> } [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: 
[[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 16 x i8> [[LOAD]] +; + store { <vscale x 4 x i32> } %x, ptr %p + %load = load <vscale x 16 x i8>, ptr %p + ret <vscale x 16 x i8> %load +} + +define {<vscale x 16 x i8>} @load_v16i8_store_v4i32_struct_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v16i8_store_v4i32_struct_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load { <vscale x 16 x i8> }, ptr [[P]], align 16 +; CHECK-NEXT: ret { <vscale x 16 x i8> } [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load { <vscale x 16 x i8> }, ptr %p + ret { <vscale x 16 x i8> } %load +} + +define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @bigexample({ <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a) vscale_range(1,16) { +; CHECK-LABEL: @bigexample( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REF_TMP:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; CHECK-NEXT: [[A_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A:%.*]], 0 +; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT]], ptr [[REF_TMP]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 +; CHECK-NEXT: [[REF_TMP_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP1]] +; CHECK-NEXT: [[A_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 1 +; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT2]], ptr [[REF_TMP_REPACK1]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP0]], 5 +; CHECK-NEXT: [[REF_TMP_REPACK3:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP3]] +; CHECK-NEXT: [[A_ELT4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 2 +; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT4]], ptr [[REF_TMP_REPACK3]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP0]], 48 +; CHECK-NEXT: [[REF_TMP_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP5]] +; CHECK-NEXT: [[A_ELT6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 3 +; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16 +; CHECK-NEXT: [[DOTUNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[DOTUNPACK]], 0 +; CHECK-NEXT: [[DOTUNPACK8:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK1]], align 16 +; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[DOTUNPACK8]], 1 +; CHECK-NEXT: [[DOTUNPACK10:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK3]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP9]], <vscale x 16 x i8> [[DOTUNPACK10]], 2 +; CHECK-NEXT: [[DOTUNPACK12:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK5]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP12]], <vscale x 16 x i8> [[DOTUNPACK12]], 3 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP15]] +; +entry: + %ref.tmp = alloca { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16 + call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %ref.tmp) + %a.elt = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 0 + store <vscale x 4 x i32> %a.elt, ptr %ref.tmp, align 16 + %0 = call i64 @llvm.vscale.i64() + %1 = shl i64 %0, 4 + %ref.tmp.repack1 = getelementptr inbounds i8, ptr %ref.tmp, i64 %1 + %a.elt2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 1 + store <vscale x 4 x i32> %a.elt2, ptr %ref.tmp.repack1, align 16 + %2 = call i64 @llvm.vscale.i64() + %3 = shl i64 %2, 5 + %ref.tmp.repack3 = getelementptr inbounds i8, ptr %ref.tmp, i64 %3 + %a.elt4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 2 + store <vscale x 4 x i32> %a.elt4, ptr %ref.tmp.repack3, align 16 + %4 = call i64 @llvm.vscale.i64() + %5 = mul i64 %4, 48 + %ref.tmp.repack5 = getelementptr inbounds i8, ptr %ref.tmp, i64 %5 + %a.elt6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 3 + store <vscale x 4 x i32> %a.elt6, ptr %ref.tmp.repack5, align 16 + %.unpack = load <vscale x 16 x i8>, ptr %ref.tmp, align 16 + %6 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %.unpack, 0 + %7 = call i64 @llvm.vscale.i64() + %8 = shl i64 %7, 4 + %.elt7 = getelementptr inbounds i8, ptr %ref.tmp, i64 %8 + %.unpack8 = load <vscale x 16 x i8>, ptr %.elt7, align 16 + %9 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, <vscale x 16 x i8> %.unpack8, 1 + %10 = call i64 @llvm.vscale.i64() + %11 = shl i64 %10, 5 + %.elt9 = getelementptr inbounds i8, ptr %ref.tmp, i64 %11 + %.unpack10 = load <vscale x 16 x i8>, ptr %.elt9, align 16 + %12 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %9, <vscale x 16 x i8> %.unpack10, 2 + %13 = call i64 @llvm.vscale.i64() + %14 = mul i64 %13, 48 + %.elt11 = getelementptr inbounds i8, ptr %ref.tmp, i64 %14 + %.unpack12 = load <vscale x 16 x i8>, ptr %.elt11, align 16 + %15 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %12, <vscale x 16 x i8> %.unpack12, 3 + call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp) + ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %15 +}
diff --git a/llvm/test/Transforms/NewGVN/vscale.ll b/llvm/test/Transforms/NewGVN/vscale.ll
new file mode 100644
index 0000000..500d58b
--- /dev/null
+++ b/llvm/test/Transforms/NewGVN/vscale.ll
@@ -0,0 +1,648 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S < %s -passes=newgvn,dce | FileCheck %s + +; Analyze Load from clobbering Load. + +define <vscale x 4 x i32> @load_store_clobber_load(ptr %p) { +; CHECK-LABEL: @load_store_clobber_load( +; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[P:%.*]], align 16 +; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, ptr undef, align 16 +; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]] +; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]] +; + %load1 = load <vscale x 4 x i32>, ptr %p + store <vscale x 4 x i32> zeroinitializer, ptr undef + %load2 = load <vscale x 4 x i32>, ptr %p ; <- load to be eliminated + %add = add <vscale x 4 x i32> %load1, %load2 + ret <vscale x 4 x i32> %add +} + +define <vscale x 4 x i32> @load_store_clobber_load_mayalias(ptr %p, ptr %p2) { +; CHECK-LABEL: @load_store_clobber_load_mayalias( +; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[P:%.*]], align 16 +; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, ptr [[P2:%.*]], align 16 +; CHECK-NEXT: [[LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16 +; CHECK-NEXT: [[SUB:%.*]] = sub <vscale x 4 x i32> [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: ret <vscale x 4 x i32> [[SUB]] +; + %load1 = load <vscale x 4 x i32>, ptr %p + store <vscale x 4 x i32> zeroinitializer, ptr %p2 + %load2 = load <vscale x 4 x i32>, ptr %p + %sub = sub <vscale x 4 x i32> %load1, %load2 + ret <vscale x 4 x i32> %sub +} + +define <vscale x 4 x i32> @load_store_clobber_load_noalias(ptr noalias %p, ptr noalias %p2) { +; CHECK-LABEL: @load_store_clobber_load_noalias( +; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[P:%.*]], align 16 +; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, ptr [[P2:%.*]], align 16 +; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]] +; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]] +; + %load1 = load <vscale x 4 x i32>, ptr %p + store <vscale x 4 x i32> zeroinitializer, ptr %p2 + %load2 = load <vscale x 4 x i32>, ptr %p ; <- load to be eliminated + %add = add <vscale x 4 x i32> %load1, %load2 + ret <vscale x 4 x i32> %add +} + +; BasicAA return MayAlias for %gep1,%gep2, could improve as MustAlias. 
+define i32 @load_clobber_load_gep1(ptr %p) { +; CHECK-LABEL: @load_clobber_load_gep1( +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[P]], i64 1 +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %gep1 = getelementptr <vscale x 4 x i32>, ptr %p, i64 0, i64 1 + %load1 = load i32, ptr %gep1 + %gep2 = getelementptr i32, ptr %p, i64 1 + %load2 = load i32, ptr %gep2 ; <- load could be eliminated + %add = add i32 %load1, %load2 + ret i32 %add +} + +define i32 @load_clobber_load_gep2(ptr %p) { +; CHECK-LABEL: @load_clobber_load_gep2( +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 1, i64 0 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[P]], i64 4 +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %gep1 = getelementptr <vscale x 4 x i32>, ptr %p, i64 1, i64 0 + %load1 = load i32, ptr %gep1 + %gep2 = getelementptr i32, ptr %p, i64 4 + %load2 = load i32, ptr %gep2 ; <- can not determine at compile-time if %load1 and %load2 are same addr + %add = add i32 %load1, %load2 + ret i32 %add +} + +; TODO: BasicAA return MayAlias for %gep1,%gep2, could improve as MustAlias. +define i32 @load_clobber_load_gep3(ptr %p) { +; CHECK-LABEL: @load_clobber_load_gep3( +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 1, i64 0 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr <vscale x 4 x float>, ptr [[P]], i64 1, i64 0 +; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[CAST:%.*]] = bitcast float [[LOAD2]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[CAST]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %gep1 = getelementptr <vscale x 4 x i32>, ptr %p, i64 1, i64 0 + %load1 = load i32, ptr %gep1 + %gep2 = getelementptr <vscale x 4 x float>, ptr %p, i64 1, i64 0 + %load2 = load float, ptr %gep2 ; <- load could be eliminated + %cast = bitcast float %load2 to i32 + %add = add i32 %load1, %cast + ret i32 %add +} + +define <vscale x 4 x i32> @load_clobber_load_fence(ptr %p) { +; CHECK-LABEL: @load_clobber_load_fence( +; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[P:%.*]], align 16 +; CHECK-NEXT: call void asm "", "~{memory}"() +; CHECK-NEXT: [[LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16 +; CHECK-NEXT: [[SUB:%.*]] = sub <vscale x 4 x i32> [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: ret <vscale x 4 x i32> [[SUB]] +; + %load1 = load <vscale x 4 x i32>, ptr %p + call void asm "", "~{memory}"() + %load2 = load <vscale x 4 x i32>, ptr %p + %sub = sub <vscale x 4 x i32> %load1, %load2 + ret <vscale x 4 x i32> %sub +} + +define <vscale x 4 x i32> @load_clobber_load_sideeffect(ptr %p) { +; CHECK-LABEL: @load_clobber_load_sideeffect( +; CHECK-NEXT: [[LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[P:%.*]], align 16 +; CHECK-NEXT: call void asm sideeffect "", ""() +; CHECK-NEXT: [[LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16 +; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]] +; + %load1 = load <vscale x 4 x i32>, ptr %p + call void asm 
sideeffect "", ""() + %load2 = load <vscale x 4 x i32>, ptr %p + %add = add <vscale x 4 x i32> %load1, %load2 + ret <vscale x 4 x i32> %add +} + +; Analyze Load from clobbering Store. + +define <vscale x 4 x i32> @store_forward_to_load(ptr %p) { +; CHECK-LABEL: @store_forward_to_load( +; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, ptr [[P:%.*]], align 16 +; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer +; + store <vscale x 4 x i32> zeroinitializer, ptr %p + %load = load <vscale x 4 x i32>, ptr %p + ret <vscale x 4 x i32> %load +} + +define <vscale x 4 x i32> @store_forward_to_load_sideeffect(ptr %p) { +; CHECK-LABEL: @store_forward_to_load_sideeffect( +; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, ptr [[P:%.*]], align 16 +; CHECK-NEXT: call void asm sideeffect "", ""() +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 4 x i32> [[LOAD]] +; + store <vscale x 4 x i32> zeroinitializer, ptr %p + call void asm sideeffect "", ""() + %load = load <vscale x 4 x i32>, ptr %p + ret <vscale x 4 x i32> %load +} + +define i32 @store_clobber_load() { +; CHECK-LABEL: @store_clobber_load( +; CHECK-NEXT: [[ALLOC:%.*]] = alloca <vscale x 4 x i32>, align 16 +; CHECK-NEXT: store <vscale x 4 x i32> undef, ptr [[ALLOC]], align 16 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[ALLOC]], i32 0, i32 1 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[PTR]], align 4 +; CHECK-NEXT: ret i32 [[LOAD]] +; + %alloc = alloca <vscale x 4 x i32> + store <vscale x 4 x i32> undef, ptr %alloc + %ptr = getelementptr <vscale x 4 x i32>, ptr %alloc, i32 0, i32 1 + %load = load i32, ptr %ptr + ret i32 %load +} + +; Analyze Load from clobbering MemInst. + +declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) + +define i32 @memset_clobber_load(ptr %p) { +; CHECK-LABEL: @memset_clobber_load( +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 1, i64 200, i1 false) +; CHECK-NEXT: ret i32 16843009 +; + tail call void @llvm.memset.p0.i64(ptr %p, i8 1, i64 200, i1 false) + %gep = getelementptr <vscale x 4 x i32>, ptr %p, i64 0, i64 5 + %load = load i32, ptr %gep + ret i32 %load +} + +define i32 @memset_clobber_load_vscaled_base(ptr %p) { +; CHECK-LABEL: @memset_clobber_load_vscaled_base( +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 1, i64 200, i1 false) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P]], i64 1, i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: ret i32 [[LOAD]] +; + tail call void @llvm.memset.p0.i64(ptr %p, i8 1, i64 200, i1 false) + %gep = getelementptr <vscale x 4 x i32>, ptr %p, i64 1, i64 1 + %load = load i32, ptr %gep + ret i32 %load +} + +define i32 @memset_clobber_load_nonconst_index(ptr %p, i64 %idx1, i64 %idx2) { +; CHECK-LABEL: @memset_clobber_load_nonconst_index( +; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 1, i64 200, i1 false) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P]], i64 [[IDX1:%.*]], i64 [[IDX2:%.*]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: ret i32 [[LOAD]] +; + tail call void @llvm.memset.p0.i64(ptr %p, i8 1, i64 200, i1 false) + %gep = getelementptr <vscale x 4 x i32>, ptr %p, i64 %idx1, i64 %idx2 + %load = load i32, ptr %gep + ret i32 %load +} + + +; Load elimination across BBs + +define ptr @load_from_alloc_replaced_with_undef() { +; CHECK-LABEL: @load_from_alloc_replaced_with_undef( +; CHECK-NEXT: entry: +; CHECK-NEXT: 
[[A:%.*]] = alloca <vscale x 4 x i32>, align 16 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[A]], i64 0, i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[LOAD]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, ptr [[A]], align 16 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret ptr [[A]] +; +entry: + %a = alloca <vscale x 4 x i32> + %gep = getelementptr <vscale x 4 x i32>, ptr %a, i64 0, i64 1 + %load = load i32, ptr %gep ; <- load to be eliminated + %tobool = icmp eq i32 %load, 0 ; <- icmp to be eliminated + br i1 %tobool, label %if.end, label %if.then + +if.then: + store <vscale x 4 x i32> zeroinitializer, ptr %a + br label %if.end + +if.end: + ret ptr %a +} + +define i32 @redundant_load_elimination_1(ptr %p) { +; CHECK-LABEL: @redundant_load_elimination_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 1, i64 1 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LOAD1]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret i32 [[LOAD1]] +; +entry: + %gep = getelementptr <vscale x 4 x i32>, ptr %p, i64 1, i64 1 + %load1 = load i32, ptr %gep + %cmp = icmp eq i32 %load1, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %load2 = load i32, ptr %gep ; <- load to be eliminated + %add = add i32 %load1, %load2 + br label %if.end + +if.end: + %result = phi i32 [ %add, %if.then ], [ %load1, %entry ] + ret i32 %result +} + +; TODO: BasicAA return MayAlias for %gep1,%gep2, could improve as NoAlias. 
+define void @redundant_load_elimination_2(i1 %c, ptr %p, ptr %q) { +; CHECK-LABEL: @redundant_load_elimination_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 1, i64 1 +; CHECK-NEXT: store i32 0, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P]], i64 1, i64 0 +; CHECK-NEXT: store i32 1, ptr [[GEP2]], align 4 +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[T:%.*]] = load i32, ptr [[GEP1]], align 4 +; CHECK-NEXT: store i32 [[T]], ptr [[Q:%.*]], align 4 +; CHECK-NEXT: ret void +; CHECK: if.else: +; CHECK-NEXT: ret void +; +entry: + %gep1 = getelementptr <vscale x 4 x i32>, ptr %p, i64 1, i64 1 + store i32 0, ptr %gep1 + %gep2 = getelementptr <vscale x 4 x i32>, ptr %p, i64 1, i64 0 + store i32 1, ptr %gep2 + br i1 %c, label %if.else, label %if.then + +if.then: + %t = load i32, ptr %gep1 ; <- load could be eliminated + store i32 %t, ptr %q + ret void + +if.else: + ret void +} + +define void @redundant_load_elimination_zero_index(i1 %c, ptr %p, ptr %q) { +; CHECK-LABEL: @redundant_load_elimination_zero_index( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: store i32 0, ptr [[GEP1]], align 4 +; CHECK-NEXT: store i32 1, ptr [[P]], align 4 +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store i32 0, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: ret void +; CHECK: if.else: +; CHECK-NEXT: ret void +; +entry: + %gep1 = getelementptr <vscale x 4 x i32>, ptr %p, i64 0, i64 1 + store i32 0, ptr %gep1 + store i32 1, ptr %p + br i1 %c, label %if.else, label %if.then + +if.then: + %t = load i32, ptr %gep1 ; <- load could be eliminated + store i32 %t, ptr %q + ret void + +if.else: + ret void +} + +define void @redundant_load_elimination_zero_index_1(i1 %c, ptr %p, ptr %q, i64 %i) { +; CHECK-LABEL: @redundant_load_elimination_zero_index_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[J:%.*]] = add i64 [[I:%.*]], 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 0, i64 [[J]] +; CHECK-NEXT: store i32 0, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P]], i64 0, i64 [[I]] +; CHECK-NEXT: store i32 1, ptr [[GEP2]], align 4 +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store i32 0, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: ret void +; CHECK: if.else: +; CHECK-NEXT: ret void +; +entry: + %j = add i64 %i, 1 + %gep1 = getelementptr <vscale x 4 x i32>, ptr %p, i64 0, i64 %j + store i32 0, ptr %gep1 + %gep2 = getelementptr <vscale x 4 x i32>, ptr %p, i64 0, i64 %i + store i32 1, ptr %gep2 + br i1 %c, label %if.else, label %if.then + +if.then: + %t = load i32, ptr %gep1 ; <- load could be eliminated + store i32 %t, ptr %q + ret void + +if.else: + ret void +} +; TODO: load in if.then could have been eliminated +define void @missing_load_elimination(i1 %c, ptr %p, ptr %q, <vscale x 4 x i32> %v) { +; CHECK-LABEL: @missing_load_elimination( +; CHECK-NEXT: entry: +; CHECK-NEXT: store <vscale x 4 x i32> zeroinitializer, ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[P1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P]], i64 1 +; CHECK-NEXT: store <vscale x 4 x i32> [[V:%.*]], ptr [[P1]], align 16 +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; 
CHECK-NEXT: [[T:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16 +; CHECK-NEXT: store <vscale x 4 x i32> [[T]], ptr [[Q:%.*]], align 16 +; CHECK-NEXT: ret void +; CHECK: if.else: +; CHECK-NEXT: ret void +; +entry: + store <vscale x 4 x i32> zeroinitializer, ptr %p + %p1 = getelementptr <vscale x 4 x i32>, ptr %p, i64 1 + store <vscale x 4 x i32> %v, ptr %p1 + br i1 %c, label %if.else, label %if.then + +if.then: + %t = load <vscale x 4 x i32>, ptr %p ; load could be eliminated + store <vscale x 4 x i32> %t, ptr %q + ret void + +if.else: + ret void +} + +; Different sizes / types + +define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v16i8_store_v4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 16 x i8> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <vscale x 16 x i8>, ptr %p + ret <vscale x 16 x i8> %load +} + +define <vscale x 4 x float> @load_v4f32_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v4f32_store_v4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x float>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <vscale x 4 x float>, ptr %p + ret <vscale x 4 x float> %load +} + +define <vscale x 4 x float> @load_v4f32_store_v16i8_forward_load(ptr %p, <vscale x 16 x i8> %x) { +; CHECK-LABEL: @load_v4f32_store_v16i8_forward_load( +; CHECK-NEXT: store <vscale x 16 x i8> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x float>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]] +; + store <vscale x 16 x i8> %x, ptr %p + %load = load <vscale x 4 x float>, ptr %p + ret <vscale x 4 x float> %load +} + +define <vscale x 4 x i32> @load_v4i32_store_v4f32_forward_load(ptr %p, <vscale x 4 x float> %x) { +; CHECK-LABEL: @load_v4i32_store_v4f32_forward_load( +; CHECK-NEXT: store <vscale x 4 x float> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 4 x i32> [[LOAD]] +; + store <vscale x 4 x float> %x, ptr %p + %load = load <vscale x 4 x i32>, ptr %p + ret <vscale x 4 x i32> %load +} + +define <vscale x 4 x i32> @load_v4i32_store_v4i64_forward_load(ptr %p, <vscale x 4 x i64> %x) { +; CHECK-LABEL: @load_v4i32_store_v4i64_forward_load( +; CHECK-NEXT: store <vscale x 4 x i64> [[X:%.*]], ptr [[P:%.*]], align 32 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 4 x i32> [[LOAD]] +; + store <vscale x 4 x i64> %x, ptr %p + %load = load <vscale x 4 x i32>, ptr %p + ret <vscale x 4 x i32> %load +} + +define <vscale x 4 x i64> @load_v4i64_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v4i64_store_v4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[P]], align 32 +; CHECK-NEXT: ret <vscale x 4 x i64> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <vscale x 4 x i64>, ptr %p + ret <vscale x 4 x i64> %load +} + +define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: 
@load_v2i32_store_v4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[P]], align 8 +; CHECK-NEXT: ret <vscale x 2 x i32> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <vscale x 2 x i32>, ptr %p + ret <vscale x 2 x i32> %load +} + +define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load_offsets(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsets( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[Q:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[P]], i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[Q]], align 8 +; CHECK-NEXT: ret <vscale x 2 x i32> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %q = getelementptr <vscale x 2 x i32>, ptr %p, i64 1 + %load = load <vscale x 2 x i32>, ptr %q + ret <vscale x 2 x i32> %load +} + +define <vscale x 2 x i32> @load_v2i32_store_v4i32_forward_load_offsetc(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v2i32_store_v4i32_forward_load_offsetc( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[Q:%.*]] = getelementptr <2 x i32>, ptr [[P]], i64 1 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[Q]], align 8 +; CHECK-NEXT: ret <vscale x 2 x i32> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %q = getelementptr <2 x i32>, ptr %p, i64 1 + %load = load <vscale x 2 x i32>, ptr %q + ret <vscale x 2 x i32> %load +} + +define <vscale x 2 x ptr> @load_v2p0_store_v4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v2p0_store_v4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 2 x ptr> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <vscale x 2 x ptr>, ptr %p + ret <vscale x 2 x ptr> %load +} + +define <vscale x 2 x i64> @load_v2i64_store_v2p0_forward_load(ptr %p, <vscale x 2 x ptr> %x) { +; CHECK-LABEL: @load_v2i64_store_v2p0_forward_load( +; CHECK-NEXT: store <vscale x 2 x ptr> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 2 x i64> [[LOAD]] +; + store <vscale x 2 x ptr> %x, ptr %p + %load = load <vscale x 2 x i64>, ptr %p + ret <vscale x 2 x i64> %load +} + +define <vscale x 16 x i8> @load_nxv16i8_store_v4i32_forward_load(ptr %p, <4 x i32> %x) { +; CHECK-LABEL: @load_nxv16i8_store_v4i32_forward_load( +; CHECK-NEXT: store <4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 16 x i8> [[LOAD]] +; + store <4 x i32> %x, ptr %p + %load = load <vscale x 16 x i8>, ptr %p + ret <vscale x 16 x i8> %load +} + +define <16 x i8> @load_v16i8_store_nxv4i32_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v16i8_store_nxv4i32_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <16 x i8> [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load <16 x i8>, ptr %p + ret <16 x i8> %load +} + +define <vscale x 16 x i8> @load_v16i8_store_v4i32_forward_constant(ptr %p) { +; CHECK-LABEL: @load_v16i8_store_v4i32_forward_constant( +; CHECK-NEXT: store <vscale x 4 x i32> splat (i32 
4), ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 16 x i8> [[LOAD]] +; + store <vscale x 4 x i32> splat (i32 4), ptr %p + %load = load <vscale x 16 x i8>, ptr %p + ret <vscale x 16 x i8> %load +} + +define <vscale x 16 x i8> @load_v16i8_struct_store_v4i32_forward_load(ptr %p, { <vscale x 4 x i32> } %x) { +; CHECK-LABEL: @load_v16i8_struct_store_v4i32_forward_load( +; CHECK-NEXT: store { <vscale x 4 x i32> } [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[P]], align 16 +; CHECK-NEXT: ret <vscale x 16 x i8> [[LOAD]] +; + store { <vscale x 4 x i32> } %x, ptr %p + %load = load <vscale x 16 x i8>, ptr %p + ret <vscale x 16 x i8> %load +} + +define {<vscale x 16 x i8>} @load_v16i8_store_v4i32_struct_forward_load(ptr %p, <vscale x 4 x i32> %x) { +; CHECK-LABEL: @load_v16i8_store_v4i32_struct_forward_load( +; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[LOAD:%.*]] = load { <vscale x 16 x i8> }, ptr [[P]], align 16 +; CHECK-NEXT: ret { <vscale x 16 x i8> } [[LOAD]] +; + store <vscale x 4 x i32> %x, ptr %p + %load = load { <vscale x 16 x i8> }, ptr %p + ret { <vscale x 16 x i8> } %load +} + +define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @bigexample({ <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a) vscale_range(1,16) { +; CHECK-LABEL: @bigexample( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REF_TMP:%.*]] = alloca { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; CHECK-NEXT: [[A_ELT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A:%.*]], 0 +; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT]], ptr [[REF_TMP]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 +; CHECK-NEXT: [[REF_TMP_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP1]] +; CHECK-NEXT: [[A_ELT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 1 +; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT2]], ptr [[REF_TMP_REPACK1]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP0]], 5 +; CHECK-NEXT: [[REF_TMP_REPACK3:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP3]] +; CHECK-NEXT: [[A_ELT4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 2 +; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT4]], ptr [[REF_TMP_REPACK3]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP0]], 48 +; CHECK-NEXT: [[REF_TMP_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP5]] +; CHECK-NEXT: [[A_ELT6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[A]], 3 +; CHECK-NEXT: store <vscale x 4 x i32> [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16 +; CHECK-NEXT: [[DOTUNPACK:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[DOTUNPACK]], 0 +; CHECK-NEXT: [[DOTUNPACK8:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK1]], align 16 +; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP6]], <vscale x 16 x i8> [[DOTUNPACK8]], 1 +; CHECK-NEXT: [[DOTUNPACK10:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK3]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP9]], <vscale x 16 x i8> [[DOTUNPACK10]], 2 +; CHECK-NEXT: [[DOTUNPACK12:%.*]] = load <vscale x 16 x i8>, ptr [[REF_TMP_REPACK5]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP12]], <vscale x 16 x i8> [[DOTUNPACK12]], 3 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP15]] +; +entry: + %ref.tmp = alloca { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }, align 16 + call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull %ref.tmp) + %a.elt = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 0 + store <vscale x 4 x i32> %a.elt, ptr %ref.tmp, align 16 + %0 = call i64 @llvm.vscale.i64() + %1 = shl i64 %0, 4 + %ref.tmp.repack1 = getelementptr inbounds i8, ptr %ref.tmp, i64 %1 + %a.elt2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 1 + store <vscale x 4 x i32> %a.elt2, ptr %ref.tmp.repack1, align 16 + %2 = call i64 @llvm.vscale.i64() + %3 = shl i64 %2, 5 + %ref.tmp.repack3 = getelementptr inbounds i8, ptr %ref.tmp, i64 %3 + %a.elt4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 2 + store <vscale x 4 x i32> %a.elt4, ptr %ref.tmp.repack3, align 16 + %4 = call i64 @llvm.vscale.i64() + %5 = mul i64 %4, 48 + %ref.tmp.repack5 = getelementptr inbounds i8, ptr %ref.tmp, i64 %5 + %a.elt6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 3 + store <vscale x 4 x i32> %a.elt6, ptr %ref.tmp.repack5, align 16 + %.unpack = load <vscale x 16 x i8>, ptr %ref.tmp, align 16 + %6 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %.unpack, 0 + %7 = call i64 @llvm.vscale.i64() + %8 = shl i64 %7, 4 + %.elt7 = getelementptr inbounds i8, ptr %ref.tmp, i64 %8 + %.unpack8 = load <vscale x 16 x i8>, ptr %.elt7, align 16 + %9 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, <vscale x 16 x i8> %.unpack8, 1 + %10 = call i64 @llvm.vscale.i64() + %11 = shl i64 %10, 5 + %.elt9 = getelementptr inbounds i8, ptr %ref.tmp, i64 %11 + %.unpack10 = load <vscale x 16 x i8>, ptr %.elt9, align 16 + %12 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %9, <vscale x 16 x i8> %.unpack10, 2 + %13 = call i64 @llvm.vscale.i64() + %14 = mul i64 %13, 48 + %.elt11 = getelementptr inbounds i8, ptr %ref.tmp, i64 %14 + %.unpack12 = load <vscale x 16 x i8>, ptr %.elt11, align 16 + %15 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %12, <vscale x 16 x i8> %.unpack12, 3 + call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp) + ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %15 +}