blob: 5c3757867f71f6b9455c355c8f846889146c3a3a [file]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s
;; Check that the vectorizer extends a Chain to the next power of two,
;; essentially loading more vector elements than the original
;; code. Alignment and other requirements for vectorization should
;; still be met.
;; Three consecutive i32 loads (elements 0, 1 and 2 of %p), the first of
;; which is 16-byte aligned. The expected output is a single <4 x i32>
;; masked load with align 16: lanes 0-2 are enabled, lane 3 is masked off
;; and the passthrough is poison. All four lanes are then extracted.
define void @load3to4(ptr %p) {
; CHECK-LABEL: define void @load3to4(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[P_0]], <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> poison)
; CHECK-NEXT: [[V01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[V12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[V23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[EXTEND4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: ret void
;
;; Addresses of elements 0..2; there is no access to element 3 in the input.
%p.0 = getelementptr i32, ptr %p, i32 0
%p.1 = getelementptr i32, ptr %p, i32 1
%p.2 = getelementptr i32, ptr %p, i32 2
;; Only the leading load carries the 16-byte alignment.
%v0 = load i32, ptr %p.0, align 16
%v1 = load i32, ptr %p.1, align 4
%v2 = load i32, ptr %p.2, align 8
ret void
}
;; Five consecutive i16 loads (elements 0..4 of %p), the first of which is
;; 16-byte aligned. The expected output is a single <8 x i16> masked load
;; with align 16: lanes 0-4 are enabled, lanes 5-7 are masked off and the
;; passthrough is poison. All eight lanes are then extracted.
define void @load5to8(ptr %p) {
; CHECK-LABEL: define void @load5to8(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[P_0:%.*]] = getelementptr i16, ptr [[P]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 16 [[P_0]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>, <8 x i16> poison)
; CHECK-NEXT: [[V05:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
; CHECK-NEXT: [[V16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1
; CHECK-NEXT: [[V27:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2
; CHECK-NEXT: [[V38:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
; CHECK-NEXT: [[V49:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
; CHECK-NEXT: [[EXTEND10:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
; CHECK-NEXT: [[EXTEND211:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
; CHECK-NEXT: [[EXTEND412:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
; CHECK-NEXT: ret void
;
;; Addresses of elements 0..4; elements 5..7 are never accessed in the input.
%p.0 = getelementptr i16, ptr %p, i32 0
%p.1 = getelementptr i16, ptr %p, i32 1
%p.2 = getelementptr i16, ptr %p, i32 2
%p.3 = getelementptr i16, ptr %p, i32 3
%p.4 = getelementptr i16, ptr %p, i32 4
;; Only the leading load carries the 16-byte alignment.
%v0 = load i16, ptr %p.0, align 16
%v1 = load i16, ptr %p.1, align 2
%v2 = load i16, ptr %p.2, align 4
%v3 = load i16, ptr %p.3, align 2
%v4 = load i16, ptr %p.4, align 8
ret void
}
;; Six consecutive i16 loads (elements 0..5 of %p), the first of which is
;; 16-byte aligned. The expected output is a single <8 x i16> masked load
;; with align 16: lanes 0-5 are enabled, lanes 6-7 are masked off and the
;; passthrough is poison. All eight lanes are then extracted.
;; NOTE(review): the FileCheck variable names bound for lanes 5-7 below are
;; identical to those in @load5to8, although lane 5 here corresponds to a
;; real load (%v5). The names are regex-bound, so matching is unaffected,
;; but the assertions may be worth regenerating -- TODO confirm.
define void @load6to8(ptr %p) {
; CHECK-LABEL: define void @load6to8(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[P_0:%.*]] = getelementptr i16, ptr [[P]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 16 [[P_0]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x i16> poison)
; CHECK-NEXT: [[V05:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
; CHECK-NEXT: [[V16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1
; CHECK-NEXT: [[V27:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2
; CHECK-NEXT: [[V38:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
; CHECK-NEXT: [[V49:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
; CHECK-NEXT: [[EXTEND10:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
; CHECK-NEXT: [[EXTEND211:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
; CHECK-NEXT: [[EXTEND412:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
; CHECK-NEXT: ret void
;
;; Addresses of elements 0..5; elements 6..7 are never accessed in the input.
%p.0 = getelementptr i16, ptr %p, i32 0
%p.1 = getelementptr i16, ptr %p, i32 1
%p.2 = getelementptr i16, ptr %p, i32 2
%p.3 = getelementptr i16, ptr %p, i32 3
%p.4 = getelementptr i16, ptr %p, i32 4
%p.5 = getelementptr i16, ptr %p, i32 5
;; Only the leading load carries the 16-byte alignment.
%v0 = load i16, ptr %p.0, align 16
%v1 = load i16, ptr %p.1, align 2
%v2 = load i16, ptr %p.2, align 4
%v3 = load i16, ptr %p.3, align 2
%v4 = load i16, ptr %p.4, align 8
%v5 = load i16, ptr %p.5, align 2
ret void
}
;; Negative counterpart of @load3to4: the same three i32 loads, but the
;; leading load is only 8-byte aligned. The expected output contains no
;; masked load. Elements 0 and 1 are combined into a plain <2 x i32> load
;; with align 8, and element 2 remains a scalar i32 load with align 8.
;; Presumably the chain is not extended to four elements because align 8
;; does not satisfy the alignment needed for a 16-byte access -- verify
;; against the pass implementation.
define void @load3to4_unaligned(ptr %p) {
; CHECK-LABEL: define void @load3to4_unaligned(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0
; CHECK-NEXT: [[P_2:%.*]] = getelementptr i32, ptr [[P]], i32 2
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P_0]], align 8
; CHECK-NEXT: [[V01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[V12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[P_2]], align 8
; CHECK-NEXT: ret void
;
;; Addresses of elements 0..2, exactly as in @load3to4.
%p.0 = getelementptr i32, ptr %p, i32 0
%p.1 = getelementptr i32, ptr %p, i32 1
%p.2 = getelementptr i32, ptr %p, i32 2
;; The leading load is align 8 here (it is align 16 in @load3to4).
%v0 = load i32, ptr %p.0, align 8
%v1 = load i32, ptr %p.1, align 4
%v2 = load i32, ptr %p.2, align 8
ret void
}