[LV] Add additional tests for replicating calls returning structs. Add additional test coverage for replicating calls return structs, in particular cases where the number of struct elements does not match the VF. Extra test coverage for https://github.com/llvm/llvm-project/pull/142433.
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll new file mode 100644 index 0000000..fe53334 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
@@ -0,0 +1,484 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "middle.block:" --version 5 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck --check-prefix=VF4 %s +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck --check-prefix=VF2IC2 %s + +define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonly %out_a) { +; VF4-LABEL: define void @struct_return_1xi64_replicate( +; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0 +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0 +; VF4-NEXT: [[TMP3:%.*]] = tail call { i64 } @fn1(float [[TMP2]]) #[[ATTR0:[0-9]+]] +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = tail call { i64 } @fn1(float [[TMP4]]) #[[ATTR0]] +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2 +; VF4-NEXT: [[TMP7:%.*]] = tail call { i64 } @fn1(float [[TMP6]]) #[[ATTR0]] +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = tail call { i64 } @fn1(float [[TMP8]]) #[[ATTR0]] +; VF4-NEXT: [[TMP10:%.*]] = extractvalue { i64 } [[TMP3]], 0 +; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i32 0 +; VF4-NEXT: [[TMP12:%.*]] = insertvalue { <4 x i64> } poison, <4 x i64> [[TMP11]], 0 +; VF4-NEXT: [[TMP13:%.*]] = extractvalue { i64 } [[TMP5]], 0 +; VF4-NEXT: [[TMP14:%.*]] = extractvalue { <4 x i64> } [[TMP12]], 0 +; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP13]], i32 1 +; VF4-NEXT: [[TMP16:%.*]] = insertvalue { <4 x i64> } [[TMP12]], <4 x i64> [[TMP15]], 0 +; VF4-NEXT: [[TMP17:%.*]] = extractvalue { i64 } [[TMP7]], 0 +; VF4-NEXT: [[TMP18:%.*]] = extractvalue { <4 x i64> } [[TMP16]], 0 +; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP17]], i32 2 +; VF4-NEXT: [[TMP20:%.*]] = insertvalue { <4 x i64> } [[TMP16]], <4 x i64> [[TMP19]], 0 +; VF4-NEXT: [[TMP21:%.*]] = extractvalue { i64 } [[TMP9]], 0 +; VF4-NEXT: [[TMP22:%.*]] = extractvalue { <4 x i64> } [[TMP20]], 0 +; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP21]], i32 3 +; VF4-NEXT: [[TMP24:%.*]] = insertvalue { <4 x i64> } [[TMP20]], <4 x i64> [[TMP23]], 0 +; VF4-NEXT: [[TMP25:%.*]] = extractvalue { <4 x i64> } [[TMP24]], 0 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[OUT_A]], i64 [[INDEX]] +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[TMP26]], i32 0 +; VF4-NEXT: store <4 x i64> [[TMP25]], ptr [[TMP27]], align 4 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VF4-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; +; VF2IC2-LABEL: define void @struct_return_1xi64_replicate( +; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0 +; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2 +; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 +; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4 +; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 +; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i64 } @fn1(float [[TMP3]]) #[[ATTR0:[0-9]+]] +; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 +; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i64 } @fn1(float [[TMP5]]) #[[ATTR0]] +; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { i64 } [[TMP4]], 0 +; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 +; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP8]], 0 +; VF2IC2-NEXT: [[TMP10:%.*]] = extractvalue { i64 } [[TMP6]], 0 +; VF2IC2-NEXT: [[TMP11:%.*]] = extractvalue { <2 x i64> } [[TMP9]], 0 +; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 +; VF2IC2-NEXT: [[TMP13:%.*]] = insertvalue { <2 x i64> } [[TMP9]], <2 x i64> [[TMP12]], 0 +; VF2IC2-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 +; VF2IC2-NEXT: [[TMP15:%.*]] = tail call { i64 } @fn1(float [[TMP14]]) #[[ATTR0]] +; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 +; VF2IC2-NEXT: [[TMP17:%.*]] = tail call { i64 } @fn1(float [[TMP16]]) #[[ATTR0]] +; VF2IC2-NEXT: [[TMP18:%.*]] = extractvalue { i64 } [[TMP15]], 0 +; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i32 0 +; VF2IC2-NEXT: [[TMP20:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP19]], 0 +; VF2IC2-NEXT: [[TMP21:%.*]] = extractvalue { i64 } [[TMP17]], 0 +; VF2IC2-NEXT: [[TMP22:%.*]] = extractvalue { <2 x i64> } [[TMP20]], 0 +; VF2IC2-NEXT: [[TMP23:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP21]], i32 1 +; VF2IC2-NEXT: [[TMP24:%.*]] = insertvalue { <2 x i64> } [[TMP20]], <2 x i64> [[TMP23]], 0 +; VF2IC2-NEXT: [[TMP25:%.*]] = extractvalue { <2 x i64> } [[TMP13]], 0 +; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { <2 x i64> } [[TMP24]], 0 +; VF2IC2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[OUT_A]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i32 0 +; VF2IC2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i32 2 +; VF2IC2-NEXT: store <2 x i64> [[TMP25]], ptr [[TMP28]], align 4 +; VF2IC2-NEXT: store <2 x i64> [[TMP26]], ptr [[TMP29]], align 4 +; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VF2IC2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + %call = tail call { i64 } @fn1(float %in_val) #0 + %extract_a = extractvalue { i64 } %call, 0 + %arrayidx2 = getelementptr inbounds i64, ptr %out_a, i64 %iv + store i64 %extract_a, ptr %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; VF4-LABEL: define void @struct_return_2xf32_replicate( +; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0 +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0 +; VF4-NEXT: [[TMP3:%.*]] = tail call { float, float } @fn2(float [[TMP2]]) #[[ATTR1:[0-9]+]] +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = tail call { float, float } @fn2(float [[TMP4]]) #[[ATTR1]] +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2 +; VF4-NEXT: [[TMP7:%.*]] = tail call { float, float } @fn2(float [[TMP6]]) #[[ATTR1]] +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = tail call { float, float } @fn2(float [[TMP8]]) #[[ATTR1]] +; VF4-NEXT: [[TMP10:%.*]] = extractvalue { float, float } [[TMP3]], 0 +; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i32 0 +; VF4-NEXT: [[TMP12:%.*]] = insertvalue { <4 x float>, <4 x float> } poison, <4 x float> [[TMP11]], 0 +; VF4-NEXT: [[TMP13:%.*]] = extractvalue { float, float } [[TMP3]], 1 +; VF4-NEXT: [[TMP14:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP12]], 1 +; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP13]], i32 0 +; VF4-NEXT: [[TMP16:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP12]], <4 x float> [[TMP15]], 1 +; VF4-NEXT: [[TMP17:%.*]] = extractvalue { float, float } [[TMP5]], 0 +; VF4-NEXT: [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP16]], 0 +; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP17]], i32 1 +; VF4-NEXT: [[TMP20:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP16]], <4 x float> [[TMP19]], 0 +; VF4-NEXT: [[TMP21:%.*]] = extractvalue { float, float } [[TMP5]], 1 +; VF4-NEXT: [[TMP22:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP20]], 1 +; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP21]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP20]], <4 x float> [[TMP23]], 1 +; VF4-NEXT: [[TMP25:%.*]] = extractvalue { float, float } [[TMP7]], 0 +; VF4-NEXT: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP24]], 0 +; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[TMP25]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP24]], <4 x float> [[TMP27]], 0 +; VF4-NEXT: [[TMP29:%.*]] = extractvalue { float, float } [[TMP7]], 1 +; VF4-NEXT: [[TMP30:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP28]], 1 +; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP29]], i32 2 +; VF4-NEXT: [[TMP32:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP28]], <4 x float> [[TMP31]], 1 +; VF4-NEXT: [[TMP33:%.*]] = extractvalue { float, float } [[TMP9]], 0 +; VF4-NEXT: [[TMP34:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP32]], 0 +; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP33]], i32 3 +; VF4-NEXT: [[TMP36:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP32]], <4 x float> [[TMP35]], 0 +; VF4-NEXT: [[TMP37:%.*]] = extractvalue { float, float } [[TMP9]], 1 +; VF4-NEXT: [[TMP38:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP36]], 1 +; VF4-NEXT: [[TMP39:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP37]], i32 3 +; VF4-NEXT: [[TMP40:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP36]], <4 x float> [[TMP39]], 1 +; VF4-NEXT: [[TMP41:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 0 +; VF4-NEXT: [[TMP42:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 1 +; VF4-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] +; VF4-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i32 0 +; VF4-NEXT: store <4 x float> [[TMP41]], ptr [[TMP44]], align 4 +; VF4-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] +; VF4-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 0 +; VF4-NEXT: store <4 x float> [[TMP42]], ptr [[TMP46]], align 4 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; +; VF2IC2-LABEL: define void @struct_return_2xf32_replicate( +; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0 +; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2 +; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 +; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4 +; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 +; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR1:[0-9]+]] +; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 +; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR1]] +; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { float, float } [[TMP4]], 0 +; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 +; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP8]], 0 +; VF2IC2-NEXT: [[TMP10:%.*]] = extractvalue { float, float } [[TMP4]], 1 +; VF2IC2-NEXT: [[TMP11:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP9]], 1 +; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP10]], i32 0 +; VF2IC2-NEXT: [[TMP13:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP9]], <2 x float> [[TMP12]], 1 +; VF2IC2-NEXT: [[TMP14:%.*]] = extractvalue { float, float } [[TMP6]], 0 +; VF2IC2-NEXT: [[TMP15:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP13]], 0 +; VF2IC2-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP14]], i32 1 +; VF2IC2-NEXT: [[TMP17:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP13]], <2 x float> [[TMP16]], 0 +; VF2IC2-NEXT: [[TMP18:%.*]] = extractvalue { float, float } [[TMP6]], 1 +; VF2IC2-NEXT: [[TMP19:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP17]], 1 +; VF2IC2-NEXT: [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i32 1 +; VF2IC2-NEXT: [[TMP21:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP17]], <2 x float> [[TMP20]], 1 +; VF2IC2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 +; VF2IC2-NEXT: [[TMP23:%.*]] = tail call { float, float } @fn2(float [[TMP22]]) #[[ATTR1]] +; VF2IC2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 +; VF2IC2-NEXT: [[TMP25:%.*]] = tail call { float, float } @fn2(float [[TMP24]]) #[[ATTR1]] +; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { float, float } [[TMP23]], 0 +; VF2IC2-NEXT: [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i32 0 +; VF2IC2-NEXT: [[TMP28:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP27]], 0 +; VF2IC2-NEXT: [[TMP29:%.*]] = extractvalue { float, float } [[TMP23]], 1 +; VF2IC2-NEXT: [[TMP30:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP28]], 1 +; VF2IC2-NEXT: [[TMP31:%.*]] = insertelement <2 x float> [[TMP30]], float [[TMP29]], i32 0 +; VF2IC2-NEXT: [[TMP32:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP28]], <2 x float> [[TMP31]], 1 +; VF2IC2-NEXT: [[TMP33:%.*]] = extractvalue { float, float } [[TMP25]], 0 +; VF2IC2-NEXT: [[TMP34:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP32]], 0 +; VF2IC2-NEXT: [[TMP35:%.*]] = insertelement <2 x float> [[TMP34]], float [[TMP33]], i32 1 +; VF2IC2-NEXT: [[TMP36:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP32]], <2 x float> [[TMP35]], 0 +; VF2IC2-NEXT: [[TMP37:%.*]] = extractvalue { float, float } [[TMP25]], 1 +; VF2IC2-NEXT: [[TMP38:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP36]], 1 +; VF2IC2-NEXT: [[TMP39:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i32 1 +; VF2IC2-NEXT: [[TMP40:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP36]], <2 x float> [[TMP39]], 1 +; VF2IC2-NEXT: [[TMP41:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP21]], 0 +; VF2IC2-NEXT: [[TMP42:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP40]], 0 +; VF2IC2-NEXT: [[TMP43:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP21]], 1 +; VF2IC2-NEXT: [[TMP44:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP40]], 1 +; VF2IC2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 0 +; VF2IC2-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 2 +; VF2IC2-NEXT: store <2 x float> [[TMP41]], ptr [[TMP46]], align 4 +; VF2IC2-NEXT: store <2 x float> [[TMP42]], ptr [[TMP47]], align 4 +; VF2IC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 +; VF2IC2-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 2 +; VF2IC2-NEXT: store <2 x float> [[TMP43]], ptr [[TMP49]], align 4 +; VF2IC2-NEXT: store <2 x float> [[TMP44]], ptr [[TMP50]], align 4 +; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VF2IC2-NEXT: br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + %call = tail call { float, float } @fn2(float %in_val) #1 + %extract_a = extractvalue { float, float } %call, 0 + %extract_b = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv + store float %extract_a, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv + store float %extract_b, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + + +define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonly %dst.a, ptr noalias %dst.b, ptr noalias %dst.c) { +; VF4-LABEL: define void @struct_return_3xi32_replicate( +; VF4-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[DST_A:%.*]], ptr noalias [[DST_B:%.*]], ptr noalias [[DST_C:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]] +; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0 +; VF4-NEXT: [[TMP3:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP2]]) #[[ATTR2:[0-9]+]] +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP4]]) #[[ATTR2]] +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2 +; VF4-NEXT: [[TMP7:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP6]]) #[[ATTR2]] +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP8]]) #[[ATTR2]] +; VF4-NEXT: [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 0 +; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0 +; VF4-NEXT: [[TMP12:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } poison, <4 x i32> [[TMP11]], 0 +; VF4-NEXT: [[TMP13:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 1 +; VF4-NEXT: [[TMP14:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], 1 +; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP13]], i32 0 +; VF4-NEXT: [[TMP16:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP12]], <4 x i32> [[TMP15]], 1 +; VF4-NEXT: [[TMP17:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 2 +; VF4-NEXT: [[TMP18:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP16]], 2 +; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP17]], i32 0 +; VF4-NEXT: [[TMP20:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP16]], <4 x i32> [[TMP19]], 2 +; VF4-NEXT: [[TMP21:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 0 +; VF4-NEXT: [[TMP22:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP20]], 0 +; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP21]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP20]], <4 x i32> [[TMP23]], 0 +; VF4-NEXT: [[TMP25:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 1 +; VF4-NEXT: [[TMP26:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP24]], 1 +; VF4-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP25]], i32 1 +; VF4-NEXT: [[TMP28:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP24]], <4 x i32> [[TMP27]], 1 +; VF4-NEXT: [[TMP29:%.*]] = extractvalue { i32, i32, i32 } [[TMP5]], 2 +; VF4-NEXT: [[TMP30:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP28]], 2 +; VF4-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP29]], i32 1 +; VF4-NEXT: [[TMP32:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP28]], <4 x i32> [[TMP31]], 2 +; VF4-NEXT: [[TMP33:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 0 +; VF4-NEXT: [[TMP34:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP32]], 0 +; VF4-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP33]], i32 2 +; VF4-NEXT: [[TMP36:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP32]], <4 x i32> [[TMP35]], 0 +; VF4-NEXT: [[TMP37:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 1 +; VF4-NEXT: [[TMP38:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP36]], 1 +; VF4-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP37]], i32 2 +; VF4-NEXT: [[TMP40:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP36]], <4 x i32> [[TMP39]], 1 +; VF4-NEXT: [[TMP41:%.*]] = extractvalue { i32, i32, i32 } [[TMP7]], 2 +; VF4-NEXT: [[TMP42:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP40]], 2 +; VF4-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP41]], i32 2 +; VF4-NEXT: [[TMP44:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP40]], <4 x i32> [[TMP43]], 2 +; VF4-NEXT: [[TMP45:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 0 +; VF4-NEXT: [[TMP46:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP44]], 0 +; VF4-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> [[TMP46]], i32 [[TMP45]], i32 3 +; VF4-NEXT: [[TMP48:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP44]], <4 x i32> [[TMP47]], 0 +; VF4-NEXT: [[TMP49:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 1 +; VF4-NEXT: [[TMP50:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP48]], 1 +; VF4-NEXT: [[TMP51:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[TMP49]], i32 3 +; VF4-NEXT: [[TMP52:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP48]], <4 x i32> [[TMP51]], 1 +; VF4-NEXT: [[TMP53:%.*]] = extractvalue { i32, i32, i32 } [[TMP9]], 2 +; VF4-NEXT: [[TMP54:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP52]], 2 +; VF4-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> [[TMP54]], i32 [[TMP53]], i32 3 +; VF4-NEXT: [[TMP56:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP52]], <4 x i32> [[TMP55]], 2 +; VF4-NEXT: [[TMP57:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 0 +; VF4-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[DST_A]], i64 [[INDEX]] +; VF4-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 0 +; VF4-NEXT: store <4 x i32> [[TMP57]], ptr [[TMP59]], align 4 +; VF4-NEXT: [[TMP60:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 1 +; VF4-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[DST_B]], i64 [[INDEX]] +; VF4-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP61]], i32 0 +; VF4-NEXT: store <4 x i32> [[TMP60]], ptr [[TMP62]], align 4 +; VF4-NEXT: [[TMP63:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 2 +; VF4-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[DST_C]], i64 [[INDEX]] +; VF4-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0 +; VF4-NEXT: store <4 x i32> [[TMP63]], ptr [[TMP65]], align 4 +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VF4-NEXT: br i1 [[TMP66]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; +; VF2IC2-LABEL: define void @struct_return_3xi32_replicate( +; VF2IC2-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[DST_A:%.*]], ptr noalias [[DST_B:%.*]], ptr noalias [[DST_C:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 2 +; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 +; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP3]]) #[[ATTR2:[0-9]+]] +; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP5]]) #[[ATTR2]] +; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 0 +; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP8]], 0 +; VF2IC2-NEXT: [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 1 +; VF2IC2-NEXT: [[TMP11:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], 1 +; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP10]], i32 0 +; VF2IC2-NEXT: [[TMP13:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], <2 x i32> [[TMP12]], 1 +; VF2IC2-NEXT: [[TMP14:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 2 +; VF2IC2-NEXT: [[TMP15:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP13]], 2 +; VF2IC2-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i32 0 +; VF2IC2-NEXT: [[TMP17:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP13]], <2 x i32> [[TMP16]], 2 +; VF2IC2-NEXT: [[TMP18:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 0 +; VF2IC2-NEXT: [[TMP19:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP17]], 0 +; VF2IC2-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP18]], i32 1 +; VF2IC2-NEXT: [[TMP21:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP17]], <2 x i32> [[TMP20]], 0 +; VF2IC2-NEXT: [[TMP22:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 1 +; VF2IC2-NEXT: [[TMP23:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP21]], 1 +; VF2IC2-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP22]], i32 1 +; VF2IC2-NEXT: [[TMP25:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP21]], <2 x i32> [[TMP24]], 1 +; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { i32, i32, i32 } [[TMP6]], 2 +; VF2IC2-NEXT: [[TMP27:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], 2 +; VF2IC2-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP27]], i32 [[TMP26]], i32 1 +; VF2IC2-NEXT: [[TMP29:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], <2 x i32> [[TMP28]], 2 +; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 0 +; VF2IC2-NEXT: [[TMP31:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP30]]) #[[ATTR2]] +; VF2IC2-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 1 +; VF2IC2-NEXT: [[TMP33:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP32]]) #[[ATTR2]] +; VF2IC2-NEXT: [[TMP34:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 0 +; VF2IC2-NEXT: [[TMP35:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i32 0 +; VF2IC2-NEXT: [[TMP36:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP35]], 0 +; VF2IC2-NEXT: [[TMP37:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 1 +; VF2IC2-NEXT: [[TMP38:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP36]], 1 +; VF2IC2-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP37]], i32 0 +; VF2IC2-NEXT: [[TMP40:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP36]], <2 x i32> [[TMP39]], 1 +; VF2IC2-NEXT: [[TMP41:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 2 +; VF2IC2-NEXT: [[TMP42:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP40]], 2 +; VF2IC2-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[TMP41]], i32 0 +; VF2IC2-NEXT: [[TMP44:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP40]], <2 x i32> [[TMP43]], 2 +; VF2IC2-NEXT: [[TMP45:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 0 +; VF2IC2-NEXT: [[TMP46:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP44]], 0 +; VF2IC2-NEXT: [[TMP47:%.*]] = insertelement <2 x i32> [[TMP46]], i32 [[TMP45]], i32 1 +; VF2IC2-NEXT: [[TMP48:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP44]], <2 x i32> [[TMP47]], 0 +; VF2IC2-NEXT: [[TMP49:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 1 +; VF2IC2-NEXT: [[TMP50:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP48]], 1 +; VF2IC2-NEXT: [[TMP51:%.*]] = insertelement <2 x i32> [[TMP50]], i32 [[TMP49]], i32 1 +; VF2IC2-NEXT: [[TMP52:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP48]], <2 x i32> [[TMP51]], 1 +; VF2IC2-NEXT: [[TMP53:%.*]] = extractvalue { i32, i32, i32 } [[TMP33]], 2 +; VF2IC2-NEXT: [[TMP54:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP52]], 2 +; VF2IC2-NEXT: [[TMP55:%.*]] = insertelement <2 x i32> [[TMP54]], i32 [[TMP53]], i32 1 +; VF2IC2-NEXT: [[TMP56:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP52]], <2 x i32> [[TMP55]], 2 +; VF2IC2-NEXT: [[TMP57:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 0 +; VF2IC2-NEXT: [[TMP58:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 0 +; VF2IC2-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[DST_A]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 0 +; VF2IC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 2 +; VF2IC2-NEXT: store <2 x i32> [[TMP57]], ptr [[TMP60]], align 4 +; VF2IC2-NEXT: store <2 x i32> [[TMP58]], ptr [[TMP61]], align 4 +; VF2IC2-NEXT: [[TMP62:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 1 +; VF2IC2-NEXT: [[TMP63:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 1 +; VF2IC2-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[DST_B]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0 +; VF2IC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 2 +; VF2IC2-NEXT: store <2 x i32> [[TMP62]], ptr [[TMP65]], align 4 +; VF2IC2-NEXT: store <2 x i32> [[TMP63]], ptr [[TMP66]], align 4 +; VF2IC2-NEXT: [[TMP67:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 2 +; VF2IC2-NEXT: [[TMP68:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 2 +; VF2IC2-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[DST_C]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 0 +; VF2IC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 2 +; VF2IC2-NEXT: store <2 x i32> [[TMP67]], ptr [[TMP70]], align 4 +; VF2IC2-NEXT: store <2 x i32> [[TMP68]], ptr [[TMP71]], align 4 +; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VF2IC2-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds i32, ptr %in, i64 %iv + %in_val = load i32, ptr %arrayidx, align 4 + %call = tail call { i32, i32, i32 } @fn3(i32 %in_val) #2 + %extract_a = extractvalue { i32, i32, i32 } %call, 0 + %gep.dst.a = getelementptr inbounds i32, ptr %dst.a, i64 %iv + store i32 %extract_a, ptr %gep.dst.a, align 4 + %extract_b = extractvalue { i32, i32, i32 } %call, 1 + %gep.dst.b = getelementptr inbounds i32, ptr %dst.b, i64 %iv + store i32 %extract_b, ptr %gep.dst.b, align 4 + %extract_c = extractvalue { i32, i32, i32 } %call, 2 + %gep.dst.c = getelementptr inbounds i32, ptr %dst.c, i64 %iv + store i32 %extract_c, ptr %gep.dst.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +declare { i64 } @fn1(float) +declare { float, float } @fn2(float) +declare { i32, i32, i32 } @fn3(i32) + +declare { <8 x i64> } @fixed_vec_fn1(<8 x float>) +declare { <8 x float>, <8 x float> } @fixed_vec_fn2(<8 x float>) +declare { <8 x i32>, <8 x i32>, <8 x i32> } @fixed_vec_fn3(<8 x i32>) + +attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn1(fixed_vec_fn1)" } +attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn2(fixed_vec_fn2)" } +attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn3(fixed_vec_fn3)" }
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index 50b9ba1..6d849c0 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -70,59 +70,6 @@ } ; CHECK-REMARKS: remark: {{.*}} vectorized loop -; Note: Later instcombines reduce this down quite a lot. -define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f32_replicate -; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) -; CHECK: vector.body: -; CHECK: [[CALL_LANE_0:%.*]] = tail call { float, float } @foo(float {{%.*}}) -; CHECK: [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}}) -; // Lane 0 -; CHECK: [[A_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0 -; CHECK: [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 -; CHECK: [[WIDE_A_0:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VEC_A_0]], 0 -; CHECK: [[B_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1 -; CHECK: [[UNDEF_B_0:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], 1 -; CHECK: [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i32 0 -; CHECK: [[WIDE_0:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], <2 x float> [[VEC_B_0]], 1 -; // Lane 1 -; CHECK: [[A_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0 -; CHECK: [[VEC_A_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_0]], 0 -; CHECK: [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i32 1 -; CHECK: [[WIDE_A:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_0]], <2 x float> [[VEC_A]], 0 -; CHECK: [[B_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1 -; CHECK: [[VEC_B_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A]], 1 -; CHECK: [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i32 1 -; CHECK: [[WIDE:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A]], <2 x float> [[VEC_B]], 1 -; // Store wide values: -; CHECK: [[VEC_A_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 0 -; CHECK: [[VEC_B_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 1 -; CHECK: store <2 x float> [[VEC_A_EXT]], ptr {{%.*}}, align 4 -; CHECK: store <2 x float> [[VEC_B_EXT]], ptr {{%.*}}, align 4 -entry: - br label %for.body - -for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv - %in_val = load float, ptr %arrayidx, align 4 - ; #3 does not have a fixed-size vector mapping (so replication is used) - %call = tail call { float, float } @foo(float %in_val) #3 - %extract_a = extractvalue { float, float } %call, 0 - %extract_b = extractvalue { float, float } %call, 1 - %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv - store float %extract_a, ptr %arrayidx2, align 4 - %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv - store float %extract_b, ptr %arrayidx4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 1024 - br i1 %exitcond.not, label %exit, label %for.body - -exit: - ret void -} - -; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) { ; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks ; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]])