; blob: 419cedbe145b5c571db930b27d65ce0ac7d0b4ff [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX1
; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX2
; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
; Module-level target description shared by all three RUN configurations above
; (the -mcpu value is the only thing that varies between them).
;
; Data layout fields, in order:
;   e            little-endian
;   m:e          ELF-style symbol mangling
;   i64:64       i64 has 64-bit ABI alignment
;   f80:128      x86_fp80 has 128-bit ABI alignment
;   n8:16:32:64  native integer widths of the target CPU
;   S128         natural stack alignment is 128 bits
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; NOTE(review): "pc_linux" uses an underscore where a triple normally separates
; vendor and OS with '-'. Confirm this is intentional before changing it; the
; CHECK lines in this file were autogenerated with this exact triple.
target triple = "x86_64-pc_linux"
; The source code:
;
;void foo1(int *A, int *B, int *trigger) {
;
;   for (int i=0; i<10000; i++) {
;     if (trigger[i] < 100) {
;       A[i] = B[i] + trigger[i];
;     }
;   }
;}
; Function Attrs: nounwind uwtable
define void @foo1(i32* %A, i32* %B, i32* %trigger) {
; AVX1-LABEL: @foo1(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000
; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000
; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]]
; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]]
; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]]
; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0
; AVX1-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef), !alias.scope !3
; AVX1-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP5]], <8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP2]]), !alias.scope !5, !noalias !7
; AVX1-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !0
; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
; AVX1-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3
; AVX1-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]]
; AVX1-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP13]], <8 x i32>* [[TMP15]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7
; AVX1-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 10000
; AVX1-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP19]], 100
; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
; AVX1: for.end:
; AVX1-NEXT: ret void
; AVX1: if.then.1:
; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4
; AVX1-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP20]], [[TMP19]]
; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4
; AVX1-NEXT: br label [[FOR_INC_1]]
; AVX1: for.inc.1:
; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10
;
; AVX2-LABEL: @foo1(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000
; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]]
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]]
; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 8
; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16
; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 24
; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !3
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 8
; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !3
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 16
; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 24
; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !3
; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX2-NEXT: [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
; AVX2-NEXT: [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
; AVX2-NEXT: [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP20]], <8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 8
; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP21]], <8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 16
; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP22]], <8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 24
; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP23]], <8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP33]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 8
; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP35]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 16
; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP37]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 24
; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP39]], align 4, !alias.scope !0
; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !3
; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 8
; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !3
; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16
; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !3
; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 24
; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !3
; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
; AVX2-NEXT: [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
; AVX2-NEXT: [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
; AVX2-NEXT: [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]]
; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP52]], <8 x i32>* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 8
; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP53]], <8 x i32>* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 16
; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP54]], <8 x i32>* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 24
; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP55]], <8 x i32>* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !5, !noalias !7
; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64
; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
; AVX2-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8
; AVX2: for.body.preheader:
; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP66:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX2: for.end:
; AVX2-NEXT: ret void
; AVX2: if.then.1:
; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP68:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4
; AVX2-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4
; AVX2-NEXT: br label [[FOR_INC_1]]
; AVX2: for.inc.1:
; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX2: if.then.2:
; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4
; AVX2-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: store i32 [[ADD_2]], i32* [[ARRAYIDX7_2]], align 4
; AVX2-NEXT: br label [[FOR_INC_2]]
; AVX2: for.inc.2:
; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP71:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX2: if.then.3:
; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP72:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4
; AVX2-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: store i32 [[ADD_3]], i32* [[ARRAYIDX7_3]], align 4
; AVX2-NEXT: br label [[FOR_INC_3]]
; AVX2: for.inc.3:
; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !10
;
; AVX512-LABEL: @foo1(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000
; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]]
; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16
; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 32
; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 48
; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !3
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 16
; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !3
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 32
; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !3
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 48
; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !3
; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
; AVX512-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
; AVX512-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP20]], <16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 16
; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP21]], <16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 32
; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP22]], <16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 48
; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP23]], <16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP33]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 16
; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP35]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 32
; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP37]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 48
; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP39]], align 4, !alias.scope !0
; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !3
; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16
; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !3
; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 32
; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !3
; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 48
; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !3
; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
; AVX512-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
; AVX512-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
; AVX512-NEXT: [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]]
; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP52]], <16 x i32>* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 16
; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP53]], <16 x i32>* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 32
; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP54]], <16 x i32>* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 48
; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP55]], <16 x i32>* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !5, !noalias !7
; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128
; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
; AVX512-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8
; AVX512: for.body.preheader:
; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP66:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX512: for.end:
; AVX512-NEXT: ret void
; AVX512: if.then.1:
; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP68:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4
; AVX512-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4
; AVX512-NEXT: br label [[FOR_INC_1]]
; AVX512: for.inc.1:
; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX512: if.then.2:
; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4
; AVX512-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: store i32 [[ADD_2]], i32* [[ARRAYIDX7_2]], align 4
; AVX512-NEXT: br label [[FOR_INC_2]]
; AVX512: for.inc.2:
; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP71:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX512: if.then.3:
; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP72:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4
; AVX512-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: store i32 [[ADD_3]], i32* [[ARRAYIDX7_3]], align 4
; AVX512-NEXT: br label [[FOR_INC_3]]
; AVX512: for.inc.3:
; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !10
;
entry:
%A.addr = alloca i32*, align 8
%B.addr = alloca i32*, align 8
%trigger.addr = alloca i32*, align 8
%i = alloca i32, align 4
store i32* %A, i32** %A.addr, align 8
store i32* %B, i32** %B.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 10000
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp slt i32 %3, 100
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load i32*, i32** %B.addr, align 8
%arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
%6 = load i32, i32* %arrayidx3, align 4
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load i32*, i32** %trigger.addr, align 8
%arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
%9 = load i32, i32* %arrayidx5, align 4
%add = add nsw i32 %6, %9
%10 = load i32, i32* %i, align 4
%idxprom6 = sext i32 %10 to i64
%11 = load i32*, i32** %A.addr, align 8
%arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6
store i32 %add, i32* %arrayidx7, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%12 = load i32, i32* %i, align 4
%inc = add nsw i32 %12, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; Same as @foo1, but the pointer arguments (%A, %B and %trigger) are in address space 1.
; Function Attrs: nounwind uwtable
define void @foo1_addrspace1(i32 addrspace(1)* %A, i32 addrspace(1)* %B, i32 addrspace(1)* %trigger) {
; AVX1-LABEL: @foo1_addrspace1(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000
; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000
; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000
; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]]
; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]]
; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]]
; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11
; AVX1-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32 addrspace(1)* [[TMP3]] to <8 x i32> addrspace(1)*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef), !alias.scope !14
; AVX1-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)*
; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP5]], <8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP2]]), !alias.scope !16, !noalias !18
; AVX1-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]]
; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)*
; AVX1-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11
; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]]
; AVX1-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(1)* [[TMP11]] to <8 x i32> addrspace(1)*
; AVX1-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14
; AVX1-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]]
; AVX1-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)*
; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP13]], <8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18
; AVX1-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 10000
; AVX1-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP17:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP18:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP19:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4
; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP19]], 100
; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
; AVX1: for.end:
; AVX1-NEXT: ret void
; AVX1: if.then.1:
; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP20:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4
; AVX1-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP20]], [[TMP19]]
; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4
; AVX1-NEXT: br label [[FOR_INC_1]]
; AVX1: for.inc.1:
; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20
;
; AVX2-LABEL: @foo1_addrspace1(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000
; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000
; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]]
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]]
; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 8
; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 16
; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 24
; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !14
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 8
; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !14
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 16
; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 24
; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !14
; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX2-NEXT: [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
; AVX2-NEXT: [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
; AVX2-NEXT: [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP20]], <8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 8
; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP21]], <8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 16
; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP22]], <8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 24
; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP23]], <8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]]
; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 8
; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 16
; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 24
; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11
; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]]
; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !14
; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 8
; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !14
; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 16
; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !14
; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 24
; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !14
; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
; AVX2-NEXT: [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
; AVX2-NEXT: [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
; AVX2-NEXT: [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]]
; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP52]], <8 x i32> addrspace(1)* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 8
; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP53]], <8 x i32> addrspace(1)* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 16
; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP54]], <8 x i32> addrspace(1)* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 24
; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <8 x i32> addrspace(1)*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP55]], <8 x i32> addrspace(1)* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !16, !noalias !18
; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64
; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
; AVX2-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19
; AVX2: for.body.preheader:
; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP66:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP67:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4
; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX2: for.end:
; AVX2-NEXT: ret void
; AVX2: if.then.1:
; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP68:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4
; AVX2-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4
; AVX2-NEXT: br label [[FOR_INC_1]]
; AVX2: for.inc.1:
; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP69:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_2]], align 4
; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX2: if.then.2:
; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP70:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_2]], align 4
; AVX2-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: store i32 [[ADD_2]], i32 addrspace(1)* [[ARRAYIDX7_2]], align 4
; AVX2-NEXT: br label [[FOR_INC_2]]
; AVX2: for.inc.2:
; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP71:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_3]], align 4
; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX2: if.then.3:
; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP72:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_3]], align 4
; AVX2-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: store i32 [[ADD_3]], i32 addrspace(1)* [[ARRAYIDX7_3]], align 4
; AVX2-NEXT: br label [[FOR_INC_3]]
; AVX2: for.inc.3:
; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !20
;
; AVX512-LABEL: @foo1_addrspace1(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000
; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000
; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]]
; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 16
; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 32
; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 48
; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !14
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 16
; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !14
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 32
; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !14
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 48
; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !14
; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
; AVX512-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
; AVX512-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP20]], <16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 16
; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP21]], <16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 32
; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP22]], <16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 48
; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP23]], <16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]]
; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 16
; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 32
; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 48
; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]]
; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !14
; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 16
; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !14
; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 32
; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !14
; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 48
; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !14
; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
; AVX512-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
; AVX512-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
; AVX512-NEXT: [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]]
; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP52]], <16 x i32> addrspace(1)* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 16
; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP53]], <16 x i32> addrspace(1)* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 32
; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP54]], <16 x i32> addrspace(1)* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 48
; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <16 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP55]], <16 x i32> addrspace(1)* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128
; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
; AVX512-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19
; AVX512: for.body.preheader:
; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP66:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP67:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4
; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX512: for.end:
; AVX512-NEXT: ret void
; AVX512: if.then.1:
; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP68:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4
; AVX512-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4
; AVX512-NEXT: br label [[FOR_INC_1]]
; AVX512: for.inc.1:
; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP69:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_2]], align 4
; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX512: if.then.2:
; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP70:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_2]], align 4
; AVX512-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: store i32 [[ADD_2]], i32 addrspace(1)* [[ARRAYIDX7_2]], align 4
; AVX512-NEXT: br label [[FOR_INC_2]]
; AVX512: for.inc.2:
; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP71:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_3]], align 4
; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX512: if.then.3:
; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP72:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_3]], align 4
; AVX512-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: store i32 [[ADD_3]], i32 addrspace(1)* [[ARRAYIDX7_3]], align 4
; AVX512-NEXT: br label [[FOR_INC_3]]
; AVX512: for.inc.3:
; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !20
;
entry:
%A.addr = alloca i32 addrspace(1)*, align 8
%B.addr = alloca i32 addrspace(1)*, align 8
%trigger.addr = alloca i32 addrspace(1)*, align 8
%i = alloca i32, align 4
store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 8
store i32 addrspace(1)* %B, i32 addrspace(1)** %B.addr, align 8
store i32 addrspace(1)* %trigger, i32 addrspace(1)** %trigger.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 10000
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom
%3 = load i32, i32 addrspace(1)* %arrayidx, align 4
%cmp1 = icmp slt i32 %3, 100
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load i32 addrspace(1)*, i32 addrspace(1)** %B.addr, align 8
%arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom2
%6 = load i32, i32 addrspace(1)* %arrayidx3, align 4
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8
%arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %8, i64 %idxprom4
%9 = load i32, i32 addrspace(1)* %arrayidx5, align 4
%add = add nsw i32 %6, %9
%10 = load i32, i32* %i, align 4
%idxprom6 = sext i32 %10 to i64
%11 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 8
%arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %11, i64 %idxprom6
store i32 %add, i32 addrspace(1)* %arrayidx7, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%12 = load i32, i32* %i, align 4
%inc = add nsw i32 %12, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; The C source code corresponding to @foo2 below:
;
;void foo2(float *A, float *B, int *trigger) {
;
; for (int i=0; i<10000; i++) {
; if (trigger[i] < 100) {
; A[i] = B[i] + trigger[i];
; }
; }
;}
; Function Attrs: nounwind uwtable
define void @foo2(float* %A, float* %B, i32* %trigger) {
; AVX1-LABEL: @foo2(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000
; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000
; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float*
; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]]
; AVX1-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]]
; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21
; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24
; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21
; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24
; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8
; AVX1-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24
; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
; AVX1-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24
; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24
; AVX1-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24
; AVX1-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
; AVX1-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float>
; AVX1-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float>
; AVX1-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float>
; AVX1-NEXT: [[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]]
; AVX1-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]]
; AVX1-NEXT: [[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]]
; AVX1-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]]
; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>*
; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28
; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 8
; AVX1-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>*
; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28
; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16
; AVX1-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>*
; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28
; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 24
; AVX1-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>*
; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28
; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX1-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29
; AVX1: for.body.preheader:
; AVX1-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float
; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
; AVX1: for.end:
; AVX1-NEXT: ret void
; AVX1: if.then.1:
; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float
; AVX1-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]]
; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4
; AVX1-NEXT: br label [[FOR_INC_1]]
; AVX1: for.inc.1:
; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30
;
; AVX2-LABEL: @foo2(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000
; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000
; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float*
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]]
; AVX2-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]]
; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24
; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8
; AVX2-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
; AVX2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24
; AVX2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24
; AVX2-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
; AVX2-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float>
; AVX2-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float>
; AVX2-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float>
; AVX2-NEXT: [[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]]
; AVX2-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]]
; AVX2-NEXT: [[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]]
; AVX2-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]]
; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 8
; AVX2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16
; AVX2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 24
; AVX2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28
; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX2-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29
; AVX2: for.body.preheader:
; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float
; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX2: for.end:
; AVX2-NEXT: ret void
; AVX2: if.then.1:
; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
; AVX2-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float
; AVX2-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]]
; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4
; AVX2-NEXT: br label [[FOR_INC_1]]
; AVX2: for.inc.1:
; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX2: if.then.2:
; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP44:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to float
; AVX2-NEXT: [[ADD_2:%.*]] = fadd float [[TMP44]], [[CONV_2]]
; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: store float [[ADD_2]], float* [[ARRAYIDX7_2]], align 4
; AVX2-NEXT: br label [[FOR_INC_2]]
; AVX2: for.inc.2:
; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX2: if.then.3:
; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to float
; AVX2-NEXT: [[ADD_3:%.*]] = fadd float [[TMP46]], [[CONV_3]]
; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: store float [[ADD_3]], float* [[ARRAYIDX7_3]], align 4
; AVX2-NEXT: br label [[FOR_INC_3]]
; AVX2: for.inc.3:
; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30
;
; AVX512-LABEL: @foo2(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000
; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000
; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float*
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]]
; AVX512-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]]
; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !21
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !21
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 32
; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !21
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 48
; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !21
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> undef), !alias.scope !24
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
; AVX512-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP17]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef), !alias.scope !24
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 32
; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP19]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef), !alias.scope !24
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 48
; AVX512-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP21]], i32 4, <16 x i1> [[TMP13]], <16 x float> undef), !alias.scope !24
; AVX512-NEXT: [[TMP22:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
; AVX512-NEXT: [[TMP23:%.*]] = sitofp <16 x i32> [[WIDE_LOAD22]] to <16 x float>
; AVX512-NEXT: [[TMP24:%.*]] = sitofp <16 x i32> [[WIDE_LOAD23]] to <16 x float>
; AVX512-NEXT: [[TMP25:%.*]] = sitofp <16 x i32> [[WIDE_LOAD24]] to <16 x float>
; AVX512-NEXT: [[TMP26:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP22]]
; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]]
; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]]
; AVX512-NEXT: [[TMP29:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]]
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP26]], <16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !26, !noalias !28
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16
; AVX512-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !26, !noalias !28
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 32
; AVX512-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP28]], <16 x float>* [[TMP35]], i32 4, <16 x i1> [[TMP12]]), !alias.scope !26, !noalias !28
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 48
; AVX512-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP29]], <16 x float>* [[TMP37]], i32 4, <16 x i1> [[TMP13]]), !alias.scope !26, !noalias !28
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29
; AVX512: for.body.preheader:
; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float
; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX512: for.end:
; AVX512-NEXT: ret void
; AVX512: if.then.1:
; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float
; AVX512-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]]
; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4
; AVX512-NEXT: br label [[FOR_INC_1]]
; AVX512: for.inc.1:
; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX512: if.then.2:
; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP44:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to float
; AVX512-NEXT: [[ADD_2:%.*]] = fadd float [[TMP44]], [[CONV_2]]
; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: store float [[ADD_2]], float* [[ARRAYIDX7_2]], align 4
; AVX512-NEXT: br label [[FOR_INC_2]]
; AVX512: for.inc.2:
; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX512: if.then.3:
; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to float
; AVX512-NEXT: [[ADD_3:%.*]] = fadd float [[TMP46]], [[CONV_3]]
; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: store float [[ADD_3]], float* [[ARRAYIDX7_3]], align 4
; AVX512-NEXT: br label [[FOR_INC_3]]
; AVX512: for.inc.3:
; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30
;
entry:
%A.addr = alloca float*, align 8
%B.addr = alloca float*, align 8
%trigger.addr = alloca i32*, align 8
%i = alloca i32, align 4
store float* %A, float** %A.addr, align 8
store float* %B, float** %B.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 10000
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp slt i32 %3, 100
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load float*, float** %B.addr, align 8
%arrayidx3 = getelementptr inbounds float, float* %5, i64 %idxprom2
%6 = load float, float* %arrayidx3, align 4
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load i32*, i32** %trigger.addr, align 8
%arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
%9 = load i32, i32* %arrayidx5, align 4
%conv = sitofp i32 %9 to float
%add = fadd float %6, %conv
%10 = load i32, i32* %i, align 4
%idxprom6 = sext i32 %10 to i64
%11 = load float*, float** %A.addr, align 8
%arrayidx7 = getelementptr inbounds float, float* %11, i64 %idxprom6
store float %add, float* %arrayidx7, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%12 = load i32, i32* %i, align 4
%inc = add nsw i32 %12, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; The C source code for @foo3:
;
;void foo3(double *A, double *B, int *trigger) {
;
;  for (int i=0; i<10000; i++) {
;    if (trigger[i] < 100) {
;      A[i] = B[i] + trigger[i];
;    }
;  }
;}
; Function Attrs: nounwind uwtable
define void @foo3(double* %A, double* %B, i32* %trigger) #0 {
; AVX1-LABEL: @foo3(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
; AVX1-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4
; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31
; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 12
; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31
; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34
; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4
; AVX1-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
; AVX1-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12
; AVX1-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
; AVX1-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
; AVX1-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double>
; AVX1-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double>
; AVX1-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double>
; AVX1-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
; AVX1-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
; AVX1-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
; AVX1-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
; AVX1-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 4
; AVX1-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8
; AVX1-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 12
; AVX1-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
; AVX1: for.end:
; AVX1-NEXT: ret void
; AVX1: if.then.1:
; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
; AVX1-NEXT: br label [[FOR_INC_1]]
; AVX1: for.inc.1:
; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40
;
; AVX2-LABEL: @foo3(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
; AVX2-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4
; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 12
; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4
; AVX2-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
; AVX2-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12
; AVX2-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
; AVX2-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
; AVX2-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double>
; AVX2-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double>
; AVX2-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double>
; AVX2-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
; AVX2-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
; AVX2-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
; AVX2-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
; AVX2-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 4
; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8
; AVX2-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 12
; AVX2-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX2-NEXT: br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 0, [[ENTRY]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX2: for.end:
; AVX2-NEXT: ret void
; AVX2: if.then.1:
; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
; AVX2-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
; AVX2-NEXT: br label [[FOR_INC_1]]
; AVX2: for.inc.1:
; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX2: if.then.2:
; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP44:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to double
; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP44]], [[CONV_2]]
; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
; AVX2-NEXT: br label [[FOR_INC_2]]
; AVX2: for.inc.2:
; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX2: if.then.3:
; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP46:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to double
; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP46]], [[CONV_3]]
; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
; AVX2-NEXT: br label [[FOR_INC_3]]
; AVX2: for.inc.3:
; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40
;
; AVX512-LABEL: @foo3(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !31
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !31
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !31
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24
; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !31
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP15]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !34
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
; AVX512-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP11]], <8 x double> undef), !alias.scope !34
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 16
; AVX512-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP12]], <8 x double> undef), !alias.scope !34
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 24
; AVX512-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> undef), !alias.scope !34
; AVX512-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
; AVX512-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x double>
; AVX512-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x double>
; AVX512-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x double>
; AVX512-NEXT: [[TMP26:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
; AVX512-NEXT: [[TMP27:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
; AVX512-NEXT: [[TMP28:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
; AVX512-NEXT: [[TMP29:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP26]], <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8
; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP27]], <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 16
; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP28]], <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 24
; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP29]], <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !39
; AVX512: for.body.preheader:
; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX512: for.end:
; AVX512-NEXT: ret void
; AVX512: if.then.1:
; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
; AVX512-NEXT: br label [[FOR_INC_1]]
; AVX512: for.inc.1:
; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX512: if.then.2:
; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP44:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to double
; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP44]], [[CONV_2]]
; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
; AVX512-NEXT: br label [[FOR_INC_2]]
; AVX512: for.inc.2:
; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX512: if.then.3:
; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP46:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to double
; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP46]], [[CONV_3]]
; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
; AVX512-NEXT: br label [[FOR_INC_3]]
; AVX512: for.inc.3:
; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !40
;
entry:
%A.addr = alloca double*, align 8
%B.addr = alloca double*, align 8
%trigger.addr = alloca i32*, align 8
%i = alloca i32, align 4
store double* %A, double** %A.addr, align 8
store double* %B, double** %B.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 10000
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp slt i32 %3, 100
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load double*, double** %B.addr, align 8
%arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
%6 = load double, double* %arrayidx3, align 8
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load i32*, i32** %trigger.addr, align 8
%arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
%9 = load i32, i32* %arrayidx5, align 4
%conv = sitofp i32 %9 to double
%add = fadd double %6, %conv
%10 = load i32, i32* %i, align 4
%idxprom6 = sext i32 %10 to i64
%11 = load double*, double** %A.addr, align 8
%arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6
store double %add, double* %arrayidx7, align 8
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%12 = load i32, i32* %i, align 4
%inc = add nsw i32 %12, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; The source code:
;
;void foo4(double *A, double *B, int *trigger) {
;
; for (int i=0; i<10000; i += 16) {
; if (trigger[i] < 100) {
; A[i] = B[i*2] + trigger[i]; << non-consecutive access
; }
; }
;}
; Function Attrs: nounwind uwtable
define void @foo4(double* %A, double* %B, i32* %trigger) {
; AVX1-LABEL: @foo4(
; AVX1-NEXT: entry:
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX1: if.then:
; AVX1-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]]
; AVX1-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16
; AVX1-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
; AVX1-NEXT: br i1 [[CMP]], label [[FOR_BODY_1:%.*]], label [[FOR_END:%.*]]
; AVX1: for.end:
; AVX1-NEXT: ret void
; AVX1: for.body.1:
; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP3]], 100
; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
; AVX1: if.then.1:
; AVX1-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP4]]
; AVX1-NEXT: [[TMP5:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP3]] to double
; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP5]], [[CONV_1]]
; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
; AVX1-NEXT: br label [[FOR_INC_1]]
; AVX1: for.inc.1:
; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 32
; AVX1-NEXT: br label [[FOR_BODY]]
;
; AVX2-LABEL: @foo4(
; AVX2-NEXT: entry:
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16
; AVX2-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
; AVX2-NEXT: br i1 [[CMP]], label [[FOR_BODY_1:%.*]], label [[FOR_END:%.*]]
; AVX2: for.end:
; AVX2-NEXT: ret void
; AVX2: for.body.1:
; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP3]], 100
; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX2: if.then.1:
; AVX2-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP4]]
; AVX2-NEXT: [[TMP5:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
; AVX2-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP3]] to double
; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP5]], [[CONV_1]]
; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
; AVX2-NEXT: br label [[FOR_INC_1]]
; AVX2: for.inc.1:
; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 32
; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP6]], 100
; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX2: if.then.2:
; AVX2-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1
; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP7]]
; AVX2-NEXT: [[TMP8:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP6]] to double
; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP8]], [[CONV_2]]
; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
; AVX2-NEXT: br label [[FOR_INC_2]]
; AVX2: for.inc.2:
; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 48
; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP9]], 100
; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX2: if.then.3:
; AVX2-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1
; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP10]]
; AVX2-NEXT: [[TMP11:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP9]] to double
; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP11]], [[CONV_3]]
; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
; AVX2-NEXT: br label [[FOR_INC_3]]
; AVX2: for.inc.3:
; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 64
; AVX2-NEXT: br label [[FOR_BODY]]
;
; AVX512-LABEL: @foo4(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 9985
; AVX512-NEXT: [[SCEVGEP12:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 9985
; AVX512-NEXT: [[SCEVGEP15:%.*]] = getelementptr double, double* [[B:%.*]], i64 19969
; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP12]] to double*
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND017:%.*]] = icmp ugt double* [[SCEVGEP15]], [[A]]
; AVX512-NEXT: [[BOUND118:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
; AVX512-NEXT: [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT19]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[VEC_IND_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[ENTRY]] ]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
; AVX512-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP4:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP4]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER20:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP5]], i32 8, <8 x i1> [[TMP3]], <8 x double> undef), !alias.scope !44
; AVX512-NEXT: [[TMP6:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double>
; AVX512-NEXT: [[TMP7:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20]], [[TMP6]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP7]], <8 x double*> [[TMP8]], i32 8, <8 x i1> [[TMP3]]), !alias.scope !46, !noalias !48
; AVX512-NEXT: [[VEC_IND_NEXT:%.*]] = add <8 x i64> [[VEC_IND]], <i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128>
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND_NEXT]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP11:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND_NEXT]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP11]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER20_1:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP12]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !44
; AVX512-NEXT: [[TMP13:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER_1]] to <8 x double>
; AVX512-NEXT: [[TMP14:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_1]], [[TMP13]]
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT]]
; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP14]], <8 x double*> [[TMP15]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !46, !noalias !48
; AVX512-NEXT: [[VEC_IND_NEXT_1:%.*]] = add <8 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND_NEXT_1]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER_2]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP18:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND_NEXT_1]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP18]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER20_2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP19]], i32 8, <8 x i1> [[TMP17]], <8 x double> undef), !alias.scope !44
; AVX512-NEXT: [[TMP20:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER_2]] to <8 x double>
; AVX512-NEXT: [[TMP21:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_2]], [[TMP20]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT_1]]
; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP21]], <8 x double*> [[TMP22]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !46, !noalias !48
; AVX512-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX]], 24
; AVX512-NEXT: [[VEC_IND_NEXT_2]] = add <8 x i64> [[VEC_IND]], <i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384>
; AVX512-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT_2]], 624
; AVX512-NEXT: br i1 [[TMP23]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !49
; AVX512: for.body.preheader:
; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP24:%.*]] = sub nuw nsw i64 9999, [[INDVARS_IV_PH]]
; AVX512-NEXT: br label [[FOR_BODY_PROL:%.*]]
; AVX512: for.body.prol:
; AVX512-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_INC_PROL:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ]
; AVX512-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_INC_PROL]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
; AVX512-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_PROL]]
; AVX512-NEXT: [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX_PROL]], align 4
; AVX512-NEXT: [[CMP1_PROL:%.*]] = icmp slt i32 [[TMP25]], 100
; AVX512-NEXT: br i1 [[CMP1_PROL]], label [[IF_THEN_PROL:%.*]], label [[FOR_INC_PROL]]
; AVX512: if.then.prol:
; AVX512-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[INDVARS_IV_PROL]], 1
; AVX512-NEXT: [[ARRAYIDX3_PROL:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP26]]
; AVX512-NEXT: [[TMP27:%.*]] = load double, double* [[ARRAYIDX3_PROL]], align 8
; AVX512-NEXT: [[CONV_PROL:%.*]] = sitofp i32 [[TMP25]] to double
; AVX512-NEXT: [[ADD_PROL:%.*]] = fadd double [[TMP27]], [[CONV_PROL]]
; AVX512-NEXT: [[ARRAYIDX7_PROL:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_PROL]]
; AVX512-NEXT: store double [[ADD_PROL]], double* [[ARRAYIDX7_PROL]], align 8
; AVX512-NEXT: br label [[FOR_INC_PROL]]
; AVX512: for.inc.prol:
; AVX512-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 16
; AVX512-NEXT: [[PROL_ITER_SUB]] = add i64 [[PROL_ITER]], -1
; AVX512-NEXT: [[PROL_ITER_CMP:%.*]] = icmp eq i64 [[PROL_ITER_SUB]], 0
; AVX512-NEXT: br i1 [[PROL_ITER_CMP]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL]], !llvm.loop !50
; AVX512: for.body.prol.loopexit:
; AVX512-NEXT: [[DOTMASK:%.*]] = and i64 [[TMP24]], 9984
; AVX512-NEXT: [[TMP28:%.*]] = icmp eq i64 [[DOTMASK]], 0
; AVX512-NEXT: br i1 [[TMP28]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL_LOOPEXIT]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[TMP30:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP30]]
; AVX512-NEXT: [[TMP31:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double
; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP31]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP32:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP32]], 100
; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX512: for.end:
; AVX512-NEXT: ret void
; AVX512: if.then.1:
; AVX512-NEXT: [[TMP33:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP33]]
; AVX512-NEXT: [[TMP34:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP32]] to double
; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP34]], [[CONV_1]]
; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
; AVX512-NEXT: br label [[FOR_INC_1]]
; AVX512: for.inc.1:
; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], 32
; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP35]], 100
; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX512: if.then.2:
; AVX512-NEXT: [[TMP36:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1
; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP36]]
; AVX512-NEXT: [[TMP37:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP35]] to double
; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP37]], [[CONV_2]]
; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
; AVX512-NEXT: br label [[FOR_INC_2]]
; AVX512: for.inc.2:
; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], 48
; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP38:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP38]], 100
; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX512: if.then.3:
; AVX512-NEXT: [[TMP39:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1
; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP39]]
; AVX512-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP38]] to double
; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP40]], [[CONV_3]]
; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
; AVX512-NEXT: br label [[FOR_INC_3]]
; AVX512: for.inc.3:
; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], 64
; AVX512-NEXT: [[CMP_3:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT_3]], 10000
; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !52
;
; Unoptimized input IR for foo4. The three arguments and the counter i live in
; allocas and are reloaded on every use; the opt invocations at the top of the
; file run the optimization pipeline before the output is matched.
; In the expectations above, the corei7-avx and core-avx2 configurations keep
; scalar loads and stores, while the knl configuration uses the
; llvm.masked.gather and llvm.masked.scatter intrinsics on 8-element vectors.
entry:
%A.addr = alloca double*, align 8
%B.addr = alloca double*, align 8
%trigger.addr = alloca i32*, align 8
%i = alloca i32, align 4
; Spill the incoming pointers and initialise i = 0.
store double* %A, double** %A.addr, align 8
store double* %B, double** %B.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
; Loop header: keep iterating while i < 10000 (signed compare).
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 10000
br i1 %cmp, label %for.body, label %for.end
; Load trigger[i] and take the guarded path only when it is below 100.
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp slt i32 %3, 100
br i1 %cmp1, label %if.then, label %if.end
; Guarded statement: A[i] = B[i*2] + (double)trigger[i].
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
; B is indexed with 2*i, so its access stride differs from that of A and trigger.
%mul = mul nsw i32 %4, 2
%idxprom2 = sext i32 %mul to i64
%5 = load double*, double** %B.addr, align 8
%arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
%6 = load double, double* %arrayidx3, align 8
; trigger[i] is loaded a second time here and converted to double for the add.
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load i32*, i32** %trigger.addr, align 8
%arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
%9 = load i32, i32* %arrayidx5, align 4
%conv = sitofp i32 %9 to double
%add = fadd double %6, %conv
; Store the sum to A[i].
%10 = load i32, i32* %i, align 4
%idxprom6 = sext i32 %10 to i64
%11 = load double*, double** %A.addr, align 8
%arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6
store double %add, double* %arrayidx7, align 8
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
; Latch: i += 16, so only every 16th element is visited.
for.inc: ; preds = %if.end
%12 = load i32, i32* %i, align 4
%inc = add nsw i32 %12, 16
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; Globals whose addresses are compared inside the constant expression that @foo5 stores.
@a = common global [1 x i32*] zeroinitializer, align 8
@c = common global i32* null, align 8
; The loop here should not be vectorized due to trapping
; constant expression
;
; The value stored to A[i] in if.then is the constant expression
;   sdiv (i32 1, i32 zext (i1 icmp eq (<one element past @a>, @c) to i32))
; Its divisor is 0 unless the two compared addresses are equal, so evaluating
; the expression can divide by zero. The expected output for every check
; prefix below therefore keeps the scalar store guarded by trigger[i] < 100;
; the loop is only unrolled (by 2 for the AVX1 prefix, by 5 for the AVX2 and
; AVX512 prefixes) and contains no vector or masked memory operations.
; Function Attrs: nounwind uwtable
define void @foo5(i32* %A, i32* %B, i32* %trigger) {
; AVX1-LABEL: @foo5(
; AVX1-NEXT: entry:
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
; AVX1: for.end:
; AVX1-NEXT: ret void
; AVX1: if.then.1:
; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
; AVX1-NEXT: br label [[FOR_INC_1]]
; AVX1: for.inc.1:
; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
;
; AVX2-LABEL: @foo5(
; AVX2-NEXT: entry:
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_4:%.*]], [[FOR_INC_4:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX2: for.end:
; AVX2-NEXT: ret void
; AVX2: if.then.1:
; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
; AVX2-NEXT: br label [[FOR_INC_1]]
; AVX2: for.inc.1:
; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP2]], 100
; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX2: if.then.2:
; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_2]], align 4
; AVX2-NEXT: br label [[FOR_INC_2]]
; AVX2: for.inc.2:
; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP3]], 100
; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3:%.*]]
; AVX2: if.then.3:
; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_3]], align 4
; AVX2-NEXT: br label [[FOR_INC_3]]
; AVX2: for.inc.3:
; AVX2-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
; AVX2-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_3]]
; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
; AVX2-NEXT: [[CMP1_4:%.*]] = icmp slt i32 [[TMP4]], 100
; AVX2-NEXT: br i1 [[CMP1_4]], label [[IF_THEN_4:%.*]], label [[FOR_INC_4]]
; AVX2: if.then.4:
; AVX2-NEXT: [[ARRAYIDX7_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_3]]
; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_4]], align 4
; AVX2-NEXT: br label [[FOR_INC_4]]
; AVX2: for.inc.4:
; AVX2-NEXT: [[INDVARS_IV_NEXT_4]] = add nuw nsw i64 [[INDVARS_IV]], 5
; AVX2-NEXT: [[EXITCOND_4:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_4]], 10000
; AVX2-NEXT: br i1 [[EXITCOND_4]], label [[FOR_END:%.*]], label [[FOR_BODY]]
;
; AVX512-LABEL: @foo5(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_4:%.*]], [[FOR_INC_4:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX512: for.end:
; AVX512-NEXT: ret void
; AVX512: if.then.1:
; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
; AVX512-NEXT: br label [[FOR_INC_1]]
; AVX512: for.inc.1:
; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP2]], 100
; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX512: if.then.2:
; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_2]], align 4
; AVX512-NEXT: br label [[FOR_INC_2]]
; AVX512: for.inc.2:
; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP3]], 100
; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3:%.*]]
; AVX512: if.then.3:
; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_3]], align 4
; AVX512-NEXT: br label [[FOR_INC_3]]
; AVX512: for.inc.3:
; AVX512-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
; AVX512-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_3]]
; AVX512-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
; AVX512-NEXT: [[CMP1_4:%.*]] = icmp slt i32 [[TMP4]], 100
; AVX512-NEXT: br i1 [[CMP1_4]], label [[IF_THEN_4:%.*]], label [[FOR_INC_4]]
; AVX512: if.then.4:
; AVX512-NEXT: [[ARRAYIDX7_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_3]]
; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_4]], align 4
; AVX512-NEXT: br label [[FOR_INC_4]]
; AVX512: for.inc.4:
; AVX512-NEXT: [[INDVARS_IV_NEXT_4]] = add nuw nsw i64 [[INDVARS_IV]], 5
; AVX512-NEXT: [[EXITCOND_4:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_4]], 10000
; AVX512-NEXT: br i1 [[EXITCOND_4]], label [[FOR_END:%.*]], label [[FOR_BODY]]
;
entry:
; The three pointer arguments and the loop counter %i are kept in stack slots;
; %i starts at 0.
%A.addr = alloca i32*, align 8
%B.addr = alloca i32*, align 8
%trigger.addr = alloca i32*, align 8
%i = alloca i32, align 4
store i32* %A, i32** %A.addr, align 8
store i32* %B, i32** %B.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
; Continue while i < 10000 (signed compare).
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 10000
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
; Guard: only enter if.then when trigger[i] < 100.
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp slt i32 %3, 100
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
; NOTE(review): %6 (B[i]), %9 (trigger[i]) and %add are computed here but never
; used; the store at the end of this block writes the constant expression
; instead. Presumably a leftover from the foo1 loop body -- confirm.
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load i32*, i32** %B.addr, align 8
%arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
%6 = load i32, i32* %arrayidx3, align 4
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load i32*, i32** %trigger.addr, align 8
%arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
%9 = load i32, i32* %arrayidx5, align 4
%add = add nsw i32 %6, %9
%10 = load i32, i32* %i, align 4
%idxprom6 = sext i32 %10 to i64
%11 = load i32*, i32** %A.addr, align 8
%arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6
; A[i] = 1 / (one-past-@a == @c). The input spells the address with indices
; (0, 1) while the expected output above shows (1, 0); both denote the address
; one i32* element past the start of @a.
store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32)), i32* %arrayidx7, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
; i += 1
%12 = load i32, i32* %i, align 4
%inc = add nsw i32 %12, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
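; Reverse loop: the index counts down from SIZE-1 to 0. SIZE in the C snippet
; below is distinct from the 'size' parameter; the expected output uses a
; constant start index of 4095, i.e. SIZE is 4096.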
;void foo6(double *in, double *out, unsigned size, int *trigger) {
;
; for (int i=SIZE-1; i>=0; i--) {
; if (trigger[i] > 0) {
; out[i] = in[i] + (double) 0.5;
; }
; }
;}
define void @foo6(double* %in, double* %out, i32 %size, i32* %trigger) {
; AVX1-LABEL: @foo6(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096
; AVX1-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096
; AVX1-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096
; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double*
; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]]
; AVX1-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX1-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]]
; AVX1-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]]
; AVX1-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]]
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]]
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3
; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41
; AVX1-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -4
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
; AVX1-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41
; AVX1-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -3
; AVX1-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41
; AVX1-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -12
; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -3
; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41
; AVX1-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
; AVX1-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer
; AVX1-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer
; AVX1-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer
; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]]
; AVX1-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3
; AVX1-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44
; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4
; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3
; AVX1-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44
; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -3
; AVX1-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44
; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12
; AVX1-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3
; AVX1-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44
; AVX1-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX1-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX1-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX1-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]]
; AVX1-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -3
; AVX1-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48
; AVX1-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -4
; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -3
; AVX1-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48
; AVX1-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8
; AVX1-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -3
; AVX1-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48
; AVX1-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -12
; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -3
; AVX1-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48
; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX1-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 4095, [[ENTRY]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01
; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1
; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX1-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0
; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
; AVX1: for.end:
; AVX1-NEXT: ret void
; AVX1: if.then.1:
; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01
; AVX1-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]]
; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8
; AVX1-NEXT: br label [[FOR_INC_1]]
; AVX1: for.inc.1:
; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nsw i64 [[INDVARS_IV]], -2
; AVX1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 0
; AVX1-NEXT: br i1 [[CMP_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50
;
; AVX2-LABEL: @foo6(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096
; AVX2-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096
; AVX2-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096
; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double*
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]]
; AVX2-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]]
; AVX2-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]]
; AVX2-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]]
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3
; AVX2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41
; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -4
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41
; AVX2-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -3
; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41
; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -12
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -3
; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41
; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
; AVX2-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer
; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer
; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]]
; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3
; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3
; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -3
; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3
; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44
; AVX2-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]]
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -3
; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -4
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -3
; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48
; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8
; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -3
; AVX2-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48
; AVX2-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -12
; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -3
; AVX2-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48
; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX2-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01
; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1
; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX2-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0
; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX2: for.end:
; AVX2-NEXT: ret void
; AVX2: if.then.1:
; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01
; AVX2-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]]
; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8
; AVX2-NEXT: br label [[FOR_INC_1]]
; AVX2: for.inc.1:
; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2
; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX2-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0
; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX2: if.then.2:
; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01
; AVX2-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]]
; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8
; AVX2-NEXT: br label [[FOR_INC_2]]
; AVX2: for.inc.2:
; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3
; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX2-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0
; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX2: if.then.3:
; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01
; AVX2-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]]
; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8
; AVX2-NEXT: br label [[FOR_INC_3]]
; AVX2: for.inc.3:
; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], -4
; AVX2-NEXT: [[CMP_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], 0
; AVX2-NEXT: br i1 [[CMP_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50
;
; AVX512-LABEL: @foo6(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096
; AVX512-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096
; AVX512-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096
; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double*
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]]
; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]]
; AVX512-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]]
; AVX512-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; AVX512-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]]
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -7
; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP4]], align 4, !alias.scope !53
; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7
; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !53
; AVX512-NEXT: [[REVERSE21:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD20]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -16
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -7
; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !53
; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD22]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -24
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -7
; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !53
; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD24]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP14:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
; AVX512-NEXT: [[TMP15:%.*]] = icmp sgt <8 x i32> [[REVERSE21]], zeroinitializer
; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE23]], zeroinitializer
; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <8 x i32> [[REVERSE25]], zeroinitializer
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]]
; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -7
; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP20]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -7
; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP23]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -16
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -7
; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP26]], i32 8, <8 x i1> [[REVERSE31]], <8 x double> undef), !alias.scope !56
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -24
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -7
; AVX512-NEXT: [[REVERSE34:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP29]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56
; AVX512-NEXT: [[TMP30:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP31:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP33:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD35]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]]
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -7
; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP30]], <8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !58, !noalias !60
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -7
; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP31]], <8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE28]]), !alias.scope !58, !noalias !60
; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -16
; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -7
; AVX512-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP32]], <8 x double>* [[TMP42]], i32 8, <8 x i1> [[REVERSE31]]), !alias.scope !58, !noalias !60
; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -24
; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -7
; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP33]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[REVERSE34]]), !alias.scope !58, !noalias !60
; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX512-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !61
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1
; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0
; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
; AVX512: for.end:
; AVX512-NEXT: ret void
; AVX512: if.then.1:
; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]]
; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8
; AVX512-NEXT: br label [[FOR_INC_1]]
; AVX512: for.inc.1:
; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2
; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0
; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
; AVX512: if.then.2:
; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]]
; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8
; AVX512-NEXT: br label [[FOR_INC_2]]
; AVX512: for.inc.2:
; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3
; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0
; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
; AVX512: if.then.3:
; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]]
; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8
; AVX512-NEXT: br label [[FOR_INC_3]]
; AVX512: for.inc.3:
; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], -4
; AVX512-NEXT: [[CMP_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], 0
; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !62
;
entry:
%in.addr = alloca double*, align 8
%out.addr = alloca double*, align 8
%size.addr = alloca i32, align 4
%trigger.addr = alloca i32*, align 8
%i = alloca i32, align 4
store double* %in, double** %in.addr, align 8
store double* %out, double** %out.addr, align 8
store i32 %size, i32* %size.addr, align 4
store i32* %trigger, i32** %trigger.addr, align 8
store i32 4095, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp sge i32 %0, 0
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp sgt i32 %3, 0
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load double*, double** %in.addr, align 8
%arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
%6 = load double, double* %arrayidx3, align 8
%add = fadd double %6, 5.000000e-01
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load double*, double** %out.addr, align 8
%arrayidx5 = getelementptr inbounds double, double* %8, i64 %idxprom4
store double %add, double* %arrayidx5, align 8
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%9 = load i32, i32* %i, align 4
%dec