blob: a9e9cf674c72e43588ff20d4f558aeb2ef22a356 [file]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=infer-alignment,load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S -o - %s | FileCheck %s
; POSITIVE TESTS
; store elements 0, 1, and 3, filling the gap with a generated store of element 2
define void @singleGap(ptr addrspace(1) %out) {
; CHECK-LABEL: define void @singleGap(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> <i64 1, i64 2, i64 poison, i64 4>, ptr addrspace(1) align 32 [[OUT]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>)
; CHECK-NEXT: ret void
;
; Input: three i64 stores at byte offsets 0, 8 and 24 from %out. Offset 16
; (element 2) is never written, so the expected masked store disables lane 2.
; The first store carries the 32-byte alignment of the whole 4 x i64 span.
store i64 1, ptr addrspace(1) %out, align 32
%getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store i64 2, ptr addrspace(1) %getElem1, align 8
; Element 2 (offset 16) is intentionally skipped.
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
store i64 4, ptr addrspace(1) %getElem3, align 8
ret void
}
; same as @singleGap but with double elements: store elements 0, 1, and 3, filling the gap with a generated store of element 2
define void @singleGapDouble(ptr addrspace(1) %out) {
; CHECK-LABEL: define void @singleGapDouble(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.masked.store.v4f64.p1(<4 x double> <double 1.000000e+00, double 2.000000e+00, double poison, double 4.000000e+00>, ptr addrspace(1) align 32 [[OUT]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>)
; CHECK-NEXT: ret void
;
; Input: three double stores at byte offsets 0, 8 and 24 from %out. Offset 16
; (element 2) is never written, so the expected masked store disables lane 2.
store double 1.0, ptr addrspace(1) %out, align 32
%getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store double 2.0, ptr addrspace(1) %getElem1, align 8
; Element 2 (offset 16) is intentionally skipped.
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
store double 4.0, ptr addrspace(1) %getElem3, align 8
ret void
}
; store elements 0, 3, filling the gaps with generated stores of elements 1 and 2
define void @multipleGaps(ptr addrspace(1) %out) {
; CHECK-LABEL: define void @multipleGaps(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> <i64 1, i64 poison, i64 poison, i64 4>, ptr addrspace(1) align 32 [[OUT]], <4 x i1> <i1 true, i1 false, i1 false, i1 true>)
; CHECK-NEXT: ret void
;
; Input: only the first and last i64 of a 32-byte span are stored (byte
; offsets 0 and 24). The expected masked store disables lanes 1 and 2.
store i64 1, ptr addrspace(1) %out, align 32
; Elements 1 and 2 (offsets 8 and 16) are intentionally skipped.
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
store i64 4, ptr addrspace(1) %getElem3, align 8
ret void
}
; store elements 0, 3, 4, 7, filling the gaps with generated stores of elements 1, 2, 5, 6
define void @multipleGaps8xi32(ptr addrspace(1) %out) {
; CHECK-LABEL: define void @multipleGaps8xi32(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> <i32 1, i32 poison, i32 poison, i32 2, i32 4, i32 poison, i32 poison, i32 8>, ptr addrspace(1) align 32 [[OUT]], <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true>)
; CHECK-NEXT: ret void
;
; Input: i32 stores at byte offsets 0, 12, 16 and 28 from %out, i.e. the first
; and last element of each group of four i32s. The expected 8 x i32 masked
; store disables lanes 1, 2, 5 and 6.
store i32 1, ptr addrspace(1) %out, align 32
; Elements 1 and 2 (offsets 4 and 8) are intentionally skipped.
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
store i32 2, ptr addrspace(1) %getElem3, align 4
%getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
store i32 4, ptr addrspace(1) %getElem4, align 4
; Elements 5 and 6 (offsets 20 and 24) are intentionally skipped.
%getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28
store i32 8, ptr addrspace(1) %getElem7, align 4
ret void
}
; store elements 0, 1, 2, 3, 5, 6, 7, filling the gap with a generated store of element 4,
; resulting in two 4xi64 stores with the second one led by a gap filled store.
define void @singleGapLongerChain(ptr addrspace(1) %out) {
; CHECK-LABEL: define void @singleGapLongerChain(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24
; CHECK-NEXT: store <4 x i64> <i64 1, i64 2, i64 3, i64 4>, ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: [[GAPFILLGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[GETELEM3]], i64 8
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> <i64 poison, i64 6, i64 7, i64 8>, ptr addrspace(1) align 32 [[GAPFILLGEP]], <4 x i1> <i1 false, i1 true, i1 true, i1 true>)
; CHECK-NEXT: ret void
;
; Input: seven i64 stores covering byte offsets 0..24 and 40..56 from %out;
; offset 32 (element 4) is never written. The expected output is a plain
; <4 x i64> store for elements 0-3 followed by a masked store that starts at
; offset 32 (the last original pointer, offset 24, plus 8) with lane 0 disabled.
store i64 1, ptr addrspace(1) %out, align 32
%getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store i64 2, ptr addrspace(1) %getElem1, align 8
%getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
store i64 3, ptr addrspace(1) %getElem2, align 8
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
store i64 4, ptr addrspace(1) %getElem3, align 8
; Element 4 (offset 32) is intentionally skipped.
%getElem5 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 40
store i64 6, ptr addrspace(1) %getElem5, align 8
%getElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 48
store i64 7, ptr addrspace(1) %getElem6, align 8
%getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 56
store i64 8, ptr addrspace(1) %getElem7, align 8
ret void
}
; store <2 x i32> sub-vectors 0, 1, and 3, filling the gap at sub-vector 2 (i32 lanes 4 and 5) with masked-off lanes
define void @vectorElements(ptr addrspace(1) %out) {
; CHECK-LABEL: define void @vectorElements(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 7, i32 8>, ptr addrspace(1) align 32 [[OUT]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true>)
; CHECK-NEXT: ret void
;
; Input: three <2 x i32> stores at byte offsets 0, 8 and 24 from %out. The
; 8-byte slot at offset 16 is never written, so the expected 8 x i32 masked
; store disables the two scalar lanes (4 and 5) that cover it.
store <2 x i32> <i32 1, i32 2>, ptr addrspace(1) %out, align 32
%getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store <2 x i32> <i32 3, i32 4>, ptr addrspace(1) %getElem1, align 8
; The <2 x i32> slot at offset 16 is intentionally skipped.
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
store <2 x i32> <i32 7, i32 8>, ptr addrspace(1) %getElem3, align 8
ret void
}
; store <2 x i64> sub-vectors 0, 1, and 3. Sub-vector 2 should not end up filled because 8xi64 is not legal.
define void @vectorElements64(ptr addrspace(1) %in) {
; CHECK-LABEL: define void @vectorElements64(
; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: store <4 x i64> <i64 1, i64 2, i64 3, i64 4>, ptr addrspace(1) [[IN]], align 32
; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 48
; CHECK-NEXT: store <2 x i64> <i64 7, i64 8>, ptr addrspace(1) [[GETELEM1]], align 16
; CHECK-NEXT: ret void
;
; Input: three <2 x i64> stores at byte offsets 0, 16 and 48 from %in (the
; pointer is only ever stored to, despite its name). Covering the untouched
; slot at offset 32 would need a 64-byte (8 x i64) access, so the expected
; output merges only the first two stores and leaves the third as it is.
store <2 x i64> <i64 1, i64 2>, ptr addrspace(1) %in, align 32
%getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16
store <2 x i64> <i64 3, i64 4>, ptr addrspace(1) %getElem1, align 16
; The <2 x i64> slot at offset 32 is intentionally skipped.
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48
store <2 x i64> <i64 7, i64 8>, ptr addrspace(1) %getElem3, align 16
ret void
}
; store elements 0, 1, 2, extending element 3
define void @extendStores(ptr addrspace(1) %out) {
; CHECK-LABEL: define void @extendStores(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> <i64 1, i64 2, i64 3, i64 poison>, ptr addrspace(1) align 32 [[OUT]], <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
; CHECK-NEXT: ret void
;
; Input: three contiguous i64 stores at byte offsets 0, 8 and 16 from %out.
; Nothing is stored at offset 24, so the expected 4 x i64 masked store pads
; the chain with a disabled trailing lane 3.
store i64 1, ptr addrspace(1) %out, align 32
%getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store i64 2, ptr addrspace(1) %getElem1, align 8
%getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
store i64 3, ptr addrspace(1) %getElem2, align 8
ret void
}
; store elements 0, 1, 2, 3, 4 extending elements 5, 6, 7
define void @extendStores8xi32(ptr addrspace(1) %out) {
; CHECK-LABEL: define void @extendStores8xi32(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison>, ptr addrspace(1) align 32 [[OUT]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>)
; CHECK-NEXT: ret void
;
; Input: five contiguous i32 stores at byte offsets 0..16 from %out. Nothing
; is stored at offsets 20..28, so the expected 8 x i32 masked store pads the
; chain with three disabled trailing lanes (5, 6 and 7).
store i32 1, ptr addrspace(1) %out, align 32
%getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4
store i32 2, ptr addrspace(1) %getElem1, align 4
%getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store i32 3, ptr addrspace(1) %getElem2, align 4
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
store i32 4, ptr addrspace(1) %getElem3, align 4
%getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
store i32 5, ptr addrspace(1) %getElem4, align 4
ret void
}
; load and store elements 0, 1, 2, 3, 4, extending both the load and the store over elements 5, 6, 7
define void @extendStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: define void @extendStoresFromLoads8xi32(
; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 32 [[IN]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>, <8 x i32> poison)
; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[LOAD38:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
; CHECK-NEXT: [[LOAD49:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
; CHECK-NEXT: [[EXTENDLOAD10:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
; CHECK-NEXT: [[EXTENDLOAD211:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6
; CHECK-NEXT: [[EXTENDLOAD412:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[LOAD05]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOAD16]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[LOAD27]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[LOAD38]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[LOAD49]], i32 4
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 poison, i32 5
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 poison, i32 6
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 poison, i32 7
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP9]], ptr addrspace(1) align 32 [[OUT]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>)
; CHECK-NEXT: ret void
;
; Input: five contiguous i32 values are copied from %in to %out (byte offsets
; 0..16 on both sides); offsets 20..28 are never touched. The expected output
; is one masked load and one masked store of 8 x i32, each with lanes 5, 6 and
; 7 disabled, and poison fed into the disabled store lanes.
;
; Loads of elements 0-4 from %in.
%load0 = load i32, ptr addrspace(1) %in, align 32
%loadGetElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4
%load1 = load i32, ptr addrspace(1) %loadGetElem1, align 4
%loadGetElem2 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8
%load2 = load i32, ptr addrspace(1) %loadGetElem2, align 4
%loadGetElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12
%load3 = load i32, ptr addrspace(1) %loadGetElem3, align 4
%loadGetElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16
%load4 = load i32, ptr addrspace(1) %loadGetElem4, align 4
; Stores of the loaded values to the matching offsets from %out.
store i32 %load0, ptr addrspace(1) %out, align 32
%getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4
store i32 %load1, ptr addrspace(1) %getElem1, align 4
%getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store i32 %load2, ptr addrspace(1) %getElem2, align 4
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
store i32 %load3, ptr addrspace(1) %getElem3, align 4
%getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
store i32 %load4, ptr addrspace(1) %getElem4, align 4
ret void
}
; store elements 0, 1, 3, 4, gap fill element 2, extend elements 5, 6, 7
define void @extendAndGapFillStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: define void @extendAndGapFillStoresFromLoads8xi32(
; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 32 [[IN]], <8 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false>, <8 x i32> poison)
; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[LOAD38:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
; CHECK-NEXT: [[LOAD49:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
; CHECK-NEXT: [[EXTENDLOAD10:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
; CHECK-NEXT: [[EXTENDLOAD211:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6
; CHECK-NEXT: [[EXTENDLOAD412:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[LOAD05]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOAD16]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 poison, i32 2
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[LOAD38]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[LOAD49]], i32 4
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 poison, i32 5
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 poison, i32 6
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 poison, i32 7
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP9]], ptr addrspace(1) align 32 [[OUT]], <8 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false>)
; CHECK-NEXT: ret void
;
; Input: i32 elements 0, 1, 3 and 4 are copied from %in to %out (byte offsets
; 0, 4, 12 and 16 on both sides). Element 2 (offset 8) is a gap inside the
; chain and elements 5-7 (offsets 20..28) lie past its end. The expected
; masked load and masked store both disable lanes 2, 5, 6 and 7.
;
; Loads of elements 0, 1, 3 and 4 from %in.
%load0 = load i32, ptr addrspace(1) %in, align 32
%loadGetElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4
%load1 = load i32, ptr addrspace(1) %loadGetElem1, align 4
%loadGetElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12
%load3 = load i32, ptr addrspace(1) %loadGetElem3, align 4
%loadGetElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16
%load4 = load i32, ptr addrspace(1) %loadGetElem4, align 4
; Stores of the loaded values to the matching offsets from %out.
store i32 %load0, ptr addrspace(1) %out, align 32
%getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4
store i32 %load1, ptr addrspace(1) %getElem1, align 4
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
store i32 %load3, ptr addrspace(1) %getElem3, align 4
%getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
store i32 %load4, ptr addrspace(1) %getElem4, align 4
ret void
}
; NEGATIVE TESTS
; Wrong address space, no gap filling
define void @singleGapWrongAddrSpace(ptr addrspace(3) %out) {
; CHECK-LABEL: define void @singleGapWrongAddrSpace(
; CHECK-SAME: ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: store <2 x i64> <i64 1, i64 2>, ptr addrspace(3) [[OUT]], align 32
; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[OUT]], i32 24
; CHECK-NEXT: store i64 4, ptr addrspace(3) [[GETELEM3]], align 8
; CHECK-NEXT: ret void
;
; Same store pattern as @singleGap (i64 at byte offsets 0, 8 and 24), but the
; pointer is in addrspace(3) instead of addrspace(1). No masked store is
; expected: only the two adjacent stores merge and the third stays scalar.
store i64 1, ptr addrspace(3) %out, align 32
%getElem1 = getelementptr inbounds i8, ptr addrspace(3) %out, i32 8
store i64 2, ptr addrspace(3) %getElem1, align 8
; Element 2 (offset 16) is intentionally skipped.
%getElem3 = getelementptr inbounds i8, ptr addrspace(3) %out, i32 24
store i64 4, ptr addrspace(3) %getElem3, align 8
ret void
}
; Not enough alignment for masked store, but we still vectorize the smaller vector
define void @singleGapMisaligned(ptr addrspace(1) %out) {
; CHECK-LABEL: define void @singleGapMisaligned(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: store <2 x i64> <i64 1, i64 2>, ptr addrspace(1) [[OUT]], align 16
; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24
; CHECK-NEXT: store i64 4, ptr addrspace(1) [[GETELEM3]], align 8
; CHECK-NEXT: ret void
;
; Same store pattern as @singleGap (i64 at byte offsets 0, 8 and 24), but the
; base store is only 16-byte aligned instead of 32. No masked store is
; expected: only the two adjacent stores merge and the third stays scalar.
store i64 1, ptr addrspace(1) %out, align 16
%getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store i64 2, ptr addrspace(1) %getElem1, align 8
; Element 2 (offset 16) is intentionally skipped.
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
store i64 4, ptr addrspace(1) %getElem3, align 8
ret void
}
; Not enough bytes to meet the minimum masked store size for the target
define void @singleGap4xi32(ptr addrspace(1) %out) {
; CHECK-LABEL: define void @singleGap4xi32(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: store i32 1, ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8
; CHECK-NEXT: store <2 x i32> <i32 3, i32 4>, ptr addrspace(1) [[GETELEM2]], align 8
; CHECK-NEXT: ret void
;
; Input: i32 stores at byte offsets 0, 8 and 12 from %out, so the whole span
; is only 16 bytes with a gap at element 1 (offset 4). No masked store is
; expected: the first store stays scalar and the last two merge.
store i32 1, ptr addrspace(1) %out, align 32
; Element 1 (offset 4) is intentionally skipped.
%getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store i32 3, ptr addrspace(1) %getElem2, align 4
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
store i32 4, ptr addrspace(1) %getElem3, align 4
ret void
}
; store elements 0, 1, 2, 5, 6, 7. 3 and 4 don't get filled because the heuristic
; only fills 2-element gaps that are in the middle of a multiple of 4
define void @gapInWrongLocation(ptr addrspace(1) %out) {
; CHECK-LABEL: define void @gapInWrongLocation(
; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: store <2 x i32> <i32 1, i32 2>, ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8
; CHECK-NEXT: store i32 3, ptr addrspace(1) [[GETELEM2]], align 8
; CHECK-NEXT: [[GETELEM5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 20
; CHECK-NEXT: store i32 5, ptr addrspace(1) [[GETELEM5]], align 4
; CHECK-NEXT: [[GETELEM6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24
; CHECK-NEXT: store <2 x i32> <i32 6, i32 7>, ptr addrspace(1) [[GETELEM6]], align 8
; CHECK-NEXT: ret void
;
; Input: i32 stores at byte offsets 0, 4, 8, 20, 24 and 28 from %out. The
; two-element gap (offsets 12 and 16) straddles the boundary between the two
; groups of four i32s. No masked store is expected: only the adjacent pairs
; at offsets 0/4 and 24/28 merge, and offsets 8 and 20 stay scalar.
store i32 1, ptr addrspace(1) %out, align 32
%getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4
store i32 2, ptr addrspace(1) %getElem1, align 4
%getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store i32 3, ptr addrspace(1) %getElem2, align 4
; Elements 3 and 4 (offsets 12 and 16) are intentionally skipped.
%getElem5 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 20
store i32 5, ptr addrspace(1) %getElem5, align 4
%getElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
store i32 6, ptr addrspace(1) %getElem6, align 4
%getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28
store i32 7, ptr addrspace(1) %getElem7, align 4
ret void
}
; This test has 32 bytes of i8s with a 2-element gap in the middle of each 4-byte chunk.
; i8s are not supported by masked stores on the target, so the stores will not be vectorized.
; The loads, on the other hand, get gap filled.
define void @cantMaski8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: define void @cantMaski8(
; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p1(ptr addrspace(1) align 32 [[IN]], <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true>, <32 x i8> poison)
; CHECK-NEXT: [[LOAD031:%.*]] = extractelement <32 x i8> [[TMP1]], i32 0
; CHECK-NEXT: [[GAPFILL32:%.*]] = extractelement <32 x i8> [[TMP1]], i32 1
; CHECK-NEXT: [[GAPFILL233:%.*]] = extractelement <32 x i8> [[TMP1]], i32 2
; CHECK-NEXT: [[LOAD334:%.*]] = extractelement <32 x i8> [[TMP1]], i32 3
; CHECK-NEXT: [[LOAD435:%.*]] = extractelement <32 x i8> [[TMP1]], i32 4
; CHECK-NEXT: [[GAPFILL436:%.*]] = extractelement <32 x i8> [[TMP1]], i32 5
; CHECK-NEXT: [[GAPFILL637:%.*]] = extractelement <32 x i8> [[TMP1]], i32 6
; CHECK-NEXT: [[LOAD738:%.*]] = extractelement <32 x i8> [[TMP1]], i32 7
; CHECK-NEXT: [[LOAD839:%.*]] = extractelement <32 x i8> [[TMP1]], i32 8
; CHECK-NEXT: [[GAPFILL840:%.*]] = extractelement <32 x i8> [[TMP1]], i32 9
; CHECK-NEXT: [[GAPFILL1041:%.*]] = extractelement <32 x i8> [[TMP1]], i32 10
; CHECK-NEXT: [[LOAD1142:%.*]] = extractelement <32 x i8> [[TMP1]], i32 11
; CHECK-NEXT: [[LOAD1243:%.*]] = extractelement <32 x i8> [[TMP1]], i32 12
; CHECK-NEXT: [[GAPFILL1244:%.*]] = extractelement <32 x i8> [[TMP1]], i32 13
; CHECK-NEXT: [[GAPFILL1445:%.*]] = extractelement <32 x i8> [[TMP1]], i32 14
; CHECK-NEXT: [[LOAD1546:%.*]] = extractelement <32 x i8> [[TMP1]], i32 15
; CHECK-NEXT: [[LOAD1647:%.*]] = extractelement <32 x i8> [[TMP1]], i32 16
; CHECK-NEXT: [[GAPFILL1648:%.*]] = extractelement <32 x i8> [[TMP1]], i32 17
; CHECK-NEXT: [[GAPFILL1849:%.*]] = extractelement <32 x i8> [[TMP1]], i32 18
; CHECK-NEXT: [[LOAD1950:%.*]] = extractelement <32 x i8> [[TMP1]], i32 19
; CHECK-NEXT: [[LOAD2051:%.*]] = extractelement <32 x i8> [[TMP1]], i32 20
; CHECK-NEXT: [[GAPFILL2052:%.*]] = extractelement <32 x i8> [[TMP1]], i32 21
; CHECK-NEXT: [[GAPFILL2253:%.*]] = extractelement <32 x i8> [[TMP1]], i32 22
; CHECK-NEXT: [[LOAD2354:%.*]] = extractelement <32 x i8> [[TMP1]], i32 23
; CHECK-NEXT: [[LOAD2455:%.*]] = extractelement <32 x i8> [[TMP1]], i32 24
; CHECK-NEXT: [[GAPFILL2456:%.*]] = extractelement <32 x i8> [[TMP1]], i32 25
; CHECK-NEXT: [[GAPFILL2657:%.*]] = extractelement <32 x i8> [[TMP1]], i32 26
; CHECK-NEXT: [[LOAD2758:%.*]] = extractelement <32 x i8> [[TMP1]], i32 27
; CHECK-NEXT: [[LOAD2859:%.*]] = extractelement <32 x i8> [[TMP1]], i32 28
; CHECK-NEXT: [[GAPFILL2860:%.*]] = extractelement <32 x i8> [[TMP1]], i32 29
; CHECK-NEXT: [[GAPFILL3061:%.*]] = extractelement <32 x i8> [[TMP1]], i32 30
; CHECK-NEXT: [[LOAD3162:%.*]] = extractelement <32 x i8> [[TMP1]], i32 31
; CHECK-NEXT: store i8 [[LOAD031]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: [[OUTELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 3
; CHECK-NEXT: store i8 [[LOAD334]], ptr addrspace(1) [[OUTELEM3]], align 1
; CHECK-NEXT: [[OUTELEM4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 4
; CHECK-NEXT: store i8 [[LOAD435]], ptr addrspace(1) [[OUTELEM4]], align 4
; CHECK-NEXT: [[OUTELEM7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 7
; CHECK-NEXT: store i8 [[LOAD738]], ptr addrspace(1) [[OUTELEM7]], align 1
; CHECK-NEXT: [[OUTELEM8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8
; CHECK-NEXT: store i8 [[LOAD839]], ptr addrspace(1) [[OUTELEM8]], align 8
; CHECK-NEXT: [[OUTELEM11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 11
; CHECK-NEXT: store i8 [[LOAD1142]], ptr addrspace(1) [[OUTELEM11]], align 1
; CHECK-NEXT: [[OUTELEM12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 12
; CHECK-NEXT: store i8 [[LOAD1243]], ptr addrspace(1) [[OUTELEM12]], align 4
; CHECK-NEXT: [[OUTELEM15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 15
; CHECK-NEXT: store i8 [[LOAD1546]], ptr addrspace(1) [[OUTELEM15]], align 1
; CHECK-NEXT: [[OUTELEM16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 16
; CHECK-NEXT: store i8 [[LOAD1647]], ptr addrspace(1) [[OUTELEM16]], align 16
; CHECK-NEXT: [[OUTELEM19:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 19
; CHECK-NEXT: store i8 [[LOAD1950]], ptr addrspace(1) [[OUTELEM19]], align 1
; CHECK-NEXT: [[OUTELEM20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 20
; CHECK-NEXT: store i8 [[LOAD2051]], ptr addrspace(1) [[OUTELEM20]], align 4
; CHECK-NEXT: [[OUTELEM23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 23
; CHECK-NEXT: store i8 [[LOAD2354]], ptr addrspace(1) [[OUTELEM23]], align 1
; CHECK-NEXT: [[OUTELEM24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24
; CHECK-NEXT: store i8 [[LOAD2455]], ptr addrspace(1) [[OUTELEM24]], align 8
; CHECK-NEXT: [[OUTELEM27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 27
; CHECK-NEXT: store i8 [[LOAD2758]], ptr addrspace(1) [[OUTELEM27]], align 1
; CHECK-NEXT: [[OUTELEM28:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 28
; CHECK-NEXT: store i8 [[LOAD2859]], ptr addrspace(1) [[OUTELEM28]], align 4
; CHECK-NEXT: [[OUTELEM31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 31
; CHECK-NEXT: store i8 [[LOAD3162]], ptr addrspace(1) [[OUTELEM31]], align 1
; CHECK-NEXT: ret void
;
; Input: within each 4-byte group of a 32-byte span, the first and last byte
; (offsets 4k and 4k+3, k = 0..7) are copied from %in to %out; the two middle
; bytes of every group are never touched. The expected output merges the
; loads into one <32 x i8> masked load with the middle lanes disabled, while
; the sixteen i8 stores are left scalar and unchanged.
;
; Loads: bytes 4k and 4k+3 from %in.
%load0 = load i8, ptr addrspace(1) %in, align 32
%getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 3
%load3 = load i8, ptr addrspace(1) %getElem3, align 1
%getElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4
%load4 = load i8, ptr addrspace(1) %getElem4, align 4
%getElem7 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 7
%load7 = load i8, ptr addrspace(1) %getElem7, align 1
%getElem8 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8
%load8 = load i8, ptr addrspace(1) %getElem8, align 8
%getElem11 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 11
%load11 = load i8, ptr addrspace(1) %getElem11, align 1
%getElem12 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12
%load12 = load i8, ptr addrspace(1) %getElem12, align 4
%getElem15 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 15
%load15 = load i8, ptr addrspace(1) %getElem15, align 1
%getElem16 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16
%load16 = load i8, ptr addrspace(1) %getElem16, align 16
%getElem19 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 19
%load19 = load i8, ptr addrspace(1) %getElem19, align 1
%getElem20 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 20
%load20 = load i8, ptr addrspace(1) %getElem20, align 4
%getElem23 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 23
%load23 = load i8, ptr addrspace(1) %getElem23, align 1
%getElem24 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24
%load24 = load i8, ptr addrspace(1) %getElem24, align 8
%getElem27 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 27
%load27 = load i8, ptr addrspace(1) %getElem27, align 1
%getElem28 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 28
%load28 = load i8, ptr addrspace(1) %getElem28, align 4
%getElem31 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 31
%load31 = load i8, ptr addrspace(1) %getElem31, align 1
; Stores: each loaded byte goes to the same offset from %out.
store i8 %load0, ptr addrspace(1) %out, align 32
%outElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 3
store i8 %load3, ptr addrspace(1) %outElem3, align 1
%outElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4
store i8 %load4, ptr addrspace(1) %outElem4, align 4
%outElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 7
store i8 %load7, ptr addrspace(1) %outElem7, align 1
%outElem8 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store i8 %load8, ptr addrspace(1) %outElem8, align 8
%outElem11 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 11
store i8 %load11, ptr addrspace(1) %outElem11, align 1
%outElem12 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
store i8 %load12, ptr addrspace(1) %outElem12, align 4
%outElem15 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 15
store i8 %load15, ptr addrspace(1) %outElem15, align 1
%outElem16 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
store i8 %load16, ptr addrspace(1) %outElem16, align 16
%outElem19 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 19
store i8 %load19, ptr addrspace(1) %outElem19, align 1
%outElem20 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 20
store i8 %load20, ptr addrspace(1) %outElem20, align 4
%outElem23 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 23
store i8 %load23, ptr addrspace(1) %outElem23, align 1
%outElem24 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
store i8 %load24, ptr addrspace(1) %outElem24, align 8
%outElem27 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 27
store i8 %load27, ptr addrspace(1) %outElem27, align 1
%outElem28 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28
store i8 %load28, ptr addrspace(1) %outElem28, align 4
%outElem31 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 31
store i8 %load31, ptr addrspace(1) %outElem31, align 1
ret void
}
; This test has 32 bytes of i16s with a 2-element gap in the middle of each 4-element chunk.
; i16s are not supported by masked stores on the target, so the stores will not be vectorized.
; The loads, on the other hand, get gap filled.
define void @cantMaski16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; CHECK-LABEL: define void @cantMaski16(
; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.masked.load.v16i16.p1(ptr addrspace(1) align 32 [[IN]], <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true>, <16 x i16> poison)
; CHECK-NEXT: [[LOAD015:%.*]] = extractelement <16 x i16> [[TMP1]], i32 0
; CHECK-NEXT: [[GAPFILL16:%.*]] = extractelement <16 x i16> [[TMP1]], i32 1
; CHECK-NEXT: [[GAPFILL217:%.*]] = extractelement <16 x i16> [[TMP1]], i32 2
; CHECK-NEXT: [[LOAD318:%.*]] = extractelement <16 x i16> [[TMP1]], i32 3
; CHECK-NEXT: [[LOAD419:%.*]] = extractelement <16 x i16> [[TMP1]], i32 4
; CHECK-NEXT: [[GAPFILL420:%.*]] = extractelement <16 x i16> [[TMP1]], i32 5
; CHECK-NEXT: [[GAPFILL621:%.*]] = extractelement <16 x i16> [[TMP1]], i32 6
; CHECK-NEXT: [[LOAD722:%.*]] = extractelement <16 x i16> [[TMP1]], i32 7
; CHECK-NEXT: [[LOAD823:%.*]] = extractelement <16 x i16> [[TMP1]], i32 8
; CHECK-NEXT: [[GAPFILL824:%.*]] = extractelement <16 x i16> [[TMP1]], i32 9
; CHECK-NEXT: [[GAPFILL1025:%.*]] = extractelement <16 x i16> [[TMP1]], i32 10
; CHECK-NEXT: [[LOAD1126:%.*]] = extractelement <16 x i16> [[TMP1]], i32 11
; CHECK-NEXT: [[LOAD1227:%.*]] = extractelement <16 x i16> [[TMP1]], i32 12
; CHECK-NEXT: [[GAPFILL1228:%.*]] = extractelement <16 x i16> [[TMP1]], i32 13
; CHECK-NEXT: [[GAPFILL1429:%.*]] = extractelement <16 x i16> [[TMP1]], i32 14
; CHECK-NEXT: [[LOAD1530:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15
; CHECK-NEXT: store i16 [[LOAD015]], ptr addrspace(1) [[OUT]], align 32
; CHECK-NEXT: [[OUTELEM6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 6
; CHECK-NEXT: store i16 [[LOAD318]], ptr addrspace(1) [[OUTELEM6]], align 2
; CHECK-NEXT: [[OUTELEM8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8
; CHECK-NEXT: store i16 [[LOAD419]], ptr addrspace(1) [[OUTELEM8]], align 8
; CHECK-NEXT: [[OUTELEM14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 14
; CHECK-NEXT: store i16 [[LOAD722]], ptr addrspace(1) [[OUTELEM14]], align 2
; CHECK-NEXT: [[OUTELEM16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 16
; CHECK-NEXT: store i16 [[LOAD823]], ptr addrspace(1) [[OUTELEM16]], align 16
; CHECK-NEXT: [[OUTELEM22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 22
; CHECK-NEXT: store i16 [[LOAD1126]], ptr addrspace(1) [[OUTELEM22]], align 2
; CHECK-NEXT: [[OUTELEM24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24
; CHECK-NEXT: store i16 [[LOAD1227]], ptr addrspace(1) [[OUTELEM24]], align 8
; CHECK-NEXT: [[OUTELEM30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 30
; CHECK-NEXT: store i16 [[LOAD1530]], ptr addrspace(1) [[OUTELEM30]], align 2
; CHECK-NEXT: ret void
;
; Input: within each group of four i16s in a 32-byte span, the first and last
; element (byte offsets 8k and 8k+6, k = 0..3) are copied from %in to %out;
; the two middle elements of every group are never touched. The expected
; output merges the loads into one <16 x i16> masked load with the middle
; lanes disabled, while the eight i16 stores are left scalar and unchanged.
; Naming note: %getElemN / %outElemN use the byte offset N, whereas %loadM
; uses the i16 element index M (M = N / 2).
;
; Loads: i16 elements 4k and 4k+3 from %in.
%load0 = load i16, ptr addrspace(1) %in, align 32
%getElem6 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 6
%load3 = load i16, ptr addrspace(1) %getElem6, align 2
%getElem8 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8
%load4 = load i16, ptr addrspace(1) %getElem8, align 8
%getElem14 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 14
%load7 = load i16, ptr addrspace(1) %getElem14, align 2
%getElem16 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16
%load8 = load i16, ptr addrspace(1) %getElem16, align 16
%getElem22 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 22
%load11 = load i16, ptr addrspace(1) %getElem22, align 2
%getElem24 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24
%load12 = load i16, ptr addrspace(1) %getElem24, align 8
%getElem30 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 30
%load15 = load i16, ptr addrspace(1) %getElem30, align 2
; Stores: each loaded element goes to the same byte offset from %out.
store i16 %load0, ptr addrspace(1) %out, align 32
%outElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 6
store i16 %load3, ptr addrspace(1) %outElem6, align 2
%outElem8 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
store i16 %load4, ptr addrspace(1) %outElem8, align 8
%outElem14 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 14
store i16 %load7, ptr addrspace(1) %outElem14, align 2
%outElem16 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
store i16 %load8, ptr addrspace(1) %outElem16, align 16
%outElem22 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 22
store i16 %load11, ptr addrspace(1) %outElem22, align 2
%outElem24 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
store i16 %load12, ptr addrspace(1) %outElem24, align 8
%outElem30 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 30
store i16 %load15, ptr addrspace(1) %outElem30, align 2
ret void
}