; blob: a815ea76c2d1f62b3857aad7266af884b4d9ad69 [file] [log] [blame] [edit]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=znver5 < %s -slp-revec | FileCheck %s
; Input for the SLP vectorizer run with -slp-revec (see the command line at the
; top of the file). Every arithmetic value in @test is already a <4 x i32>
; vector. The autogenerated assertions below expect the eight <4 x i32> phis of
; the exit block to be replaced by one <32 x i32> phi, with 4-lane slices of it
; extracted by shufflevector to feed the four packusdw calls.
define <8 x i16> @test(ptr %input, ptr %output_r, <4 x i32> %0) {
; CHECK-LABEL: define <8 x i16> @test(
; CHECK-SAME: ptr [[INPUT:%.*]], ptr [[OUTPUT_R:%.*]], <4 x i32> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br i1 false, label %[[WRITE_BUFFER_8X8_EXIT631:.*]], label %[[IF_THEN_I25_I_I478:.*]]
; CHECK: [[IF_THEN_I25_I_I478]]:
; CHECK-NEXT: [[VECINIT3_I_I285_I326:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP2:%.*]] = ashr <32 x i32> [[TMP1]], splat (i32 1)
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = ashr <16 x i32> [[TMP3]], splat (i32 1)
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP5]], splat (i32 1)
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i32> [[TMP1]], <32 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[VECINIT3_I_I285_I326]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP12:%.*]] = add <16 x i32> [[TMP11]], <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP13:%.*]] = ashr <16 x i32> [[TMP12]], splat (i32 1)
; CHECK-NEXT: [[TMP14:%.*]] = add <16 x i32> [[TMP13]], [[TMP4]]
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i32> [[TMP14]], <16 x i32> [[TMP7]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP16:%.*]] = add <32 x i32> [[TMP2]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = sub <32 x i32> [[TMP2]], [[TMP15]]
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <32 x i32> [[TMP16]], <32 x i32> [[TMP17]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; CHECK-NEXT: br label %[[WRITE_BUFFER_8X8_EXIT631]]
; CHECK: [[WRITE_BUFFER_8X8_EXIT631]]:
; CHECK-NEXT: [[TMP19:%.*]] = phi <32 x i32> [ [[TMP18]], %[[IF_THEN_I25_I_I478]] ], [ zeroinitializer, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <32 x i32> [[TMP19]], <32 x i32> poison, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <32 x i32> [[TMP19]], <32 x i32> poison, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[TMP22:%.*]] = tail call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[TMP21]], <4 x i32> [[TMP20]])
; CHECK-NEXT: store <8 x i16> [[TMP22]], ptr [[INPUT]], align 1
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <32 x i32> [[TMP19]], <32 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <32 x i32> [[TMP19]], <32 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP25:%.*]] = tail call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[TMP24]], <4 x i32> [[TMP23]])
; CHECK-NEXT: store <8 x i16> [[TMP25]], ptr [[OUTPUT_R]], align 1
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <32 x i32> [[TMP19]], <32 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <32 x i32> [[TMP19]], <32 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP28:%.*]] = tail call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[TMP27]], <4 x i32> [[TMP26]])
; CHECK-NEXT: store <8 x i16> [[TMP28]], ptr [[INPUT]], align 1
; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <32 x i32> [[TMP19]], <32 x i32> poison, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <32 x i32> [[TMP19]], <32 x i32> poison, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP31:%.*]] = tail call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[TMP30]], <4 x i32> [[TMP29]])
; CHECK-NEXT: ret <8 x i16> [[TMP31]]
;
entry:
; The branch condition is the constant false, so the second target
; (%if.then.i25.i.i478) is the one taken; the edge to the exit block still
; exists in the CFG and supplies the zeroinitializer operand of every phi there.
br i1 false, label %write_buffer_8x8.exit631, label %if.then.i25.i.i478
if.then.i25.i.i478: ; preds = %entry
; An all-zero shuffle mask selects lane 0 of %0 for all four result lanes.
%vecinit3.i.i285.i326 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
; First group (".1" names): produces the incoming values of the phis
; %out.sroa.152.1.in, %17, %19 and %21. `ashr %0, 1` is recomputed as a
; separate instruction for almost every use instead of being shared.
; NOTE(review): the duplication presumably comes from the reduced reproducer and
; is part of what the vectorizer is being tested on - confirm before cleaning up.
%add.i312.1.i430 = add <4 x i32> %vecinit3.i.i285.i326, splat (i32 1)
%1 = ashr <4 x i32> %0, splat (i32 1)
%2 = ashr <4 x i32> %add.i312.1.i430, splat (i32 1)
%3 = ashr <4 x i32> %0, splat (i32 1)
%add.i330.1.i448 = add <4 x i32> %2, %3
%sub.i348.1.i473 = sub <4 x i32> %1, %add.i330.1.i448
%4 = ashr <4 x i32> %0, splat (i32 1)
%5 = ashr <4 x i32> %0, splat (i32 1)
%add.i327.1.i445 = add <4 x i32> %4, %5
%add.i344.1.i465 = add <4 x i32> %1, %add.i327.1.i445
%6 = ashr <4 x i32> %0, splat (i32 1)
%7 = ashr <4 x i32> %0, splat (i32 1)
%add.i343.1.i463 = add <4 x i32> %6, %7
%8 = ashr <4 x i32> %0, splat (i32 1)
%add.i342.1.i461 = add <4 x i32> %8, %7
; Second group: same shape as the first and produces the incoming values of the
; phis %out.sroa.142.1.in, %18, %20 and %22. The one structural difference is
; %13, which shifts (%0 + 1) where the first group's counterpart %5 shifts %0.
%add.i312.i363 = add <4 x i32> %vecinit3.i.i285.i326, splat (i32 1)
%9 = ashr <4 x i32> %0, splat (i32 1)
%10 = ashr <4 x i32> %add.i312.i363, splat (i32 1)
%11 = ashr <4 x i32> %0, splat (i32 1)
%add.i330.i381 = add <4 x i32> %10, %11
%sub.i348.i405 = sub <4 x i32> %9, %add.i330.i381
%12 = ashr <4 x i32> %0, splat (i32 1)
%add.i.i349 = add <4 x i32> %0, splat (i32 1)
%13 = ashr <4 x i32> %add.i.i349, splat (i32 1)
%add.i327.i378 = add <4 x i32> %12, %13
%add.i344.i397 = add <4 x i32> %9, %add.i327.i378
%14 = ashr <4 x i32> %0, splat (i32 1)
%15 = ashr <4 x i32> %0, splat (i32 1)
%add.i343.i395 = add <4 x i32> %14, %15
%16 = ashr <4 x i32> %0, splat (i32 1)
%add.i342.i393 = add <4 x i32> %16, %15
br label %write_buffer_8x8.exit631
write_buffer_8x8.exit631: ; preds = %if.then.i25.i.i478, %entry
; Eight <4 x i32> phis: each takes its value from %if.then.i25.i.i478 or
; zeroinitializer from %entry.
%out.sroa.152.1.in = phi <4 x i32> [ %sub.i348.1.i473, %if.then.i25.i.i478 ], [ zeroinitializer, %entry ]
%out.sroa.142.1.in = phi <4 x i32> [ %sub.i348.i405, %if.then.i25.i.i478 ], [ zeroinitializer, %entry ]
%17 = phi <4 x i32> [ %add.i344.1.i465, %if.then.i25.i.i478 ], [ zeroinitializer, %entry ]
%18 = phi <4 x i32> [ %add.i344.i397, %if.then.i25.i.i478 ], [ zeroinitializer, %entry ]
%19 = phi <4 x i32> [ %add.i343.1.i463, %if.then.i25.i.i478 ], [ zeroinitializer, %entry ]
%20 = phi <4 x i32> [ %add.i343.i395, %if.then.i25.i.i478 ], [ zeroinitializer, %entry ]
%21 = phi <4 x i32> [ %add.i342.1.i461, %if.then.i25.i.i478 ], [ zeroinitializer, %entry ]
%22 = phi <4 x i32> [ %add.i342.i393, %if.then.i25.i.i478 ], [ zeroinitializer, %entry ]
; The phis are consumed pairwise by four packusdw calls. Three results are
; stored (%input is written twice, %output_r once); the fourth is returned.
%23 = tail call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %22, <4 x i32> %21)
store <8 x i16> %23, ptr %input, align 1
%24 = tail call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %20, <4 x i32> %19)
store <8 x i16> %24, ptr %output_r, align 1
%25 = tail call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %18, <4 x i32> %17)
store <8 x i16> %25, ptr %input, align 1
%26 = tail call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %out.sroa.142.1.in, <4 x i32> %out.sroa.152.1.in)
ret <8 x i16> %26
}
; X86 SSE4.1 packusdw intrinsic called four times by @test: takes two <4 x i32>
; operands and returns one <8 x i16> value.
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)