| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py |
| ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE2 |
| ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4 |
| ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 |
| ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 |
| ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE2 |
| ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4 |
| ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 |
| ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 |
| |
| ; PR128424 |
| |
| %"struct.std::array8" = type { [8 x i16] } |
| %"struct.std::array16" = type { [16 x i8] } |
| |
define { i64, i64 } @avgr_16_u8(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
; SSE2-LABEL: @avgr_16_u8(
; SSE2-NEXT:  entry:
; SSE2-NEXT:    [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i16
; SSE2-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[A_COERCE1:%.*]], i64 1
; SSE2-NEXT:    [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 16)
; SSE2-NEXT:    [[TMP4:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 24)
; SSE2-NEXT:    [[TMP5:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 32)
; SSE2-NEXT:    [[TMP6:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 40)
; SSE2-NEXT:    [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
; SSE2-NEXT:    [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 56
; SSE2-NEXT:    [[TMP7:%.*]] = trunc i64 [[A_COERCE1]] to i16
; SSE2-NEXT:    [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
; SSE2-NEXT:    [[TMP8:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i16
; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0]], i64 0
; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[B_COERCE1:%.*]], i64 1
; SSE2-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 16)
; SSE2-NEXT:    [[TMP12:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 24)
; SSE2-NEXT:    [[TMP13:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 32)
; SSE2-NEXT:    [[TMP14:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 40)
; SSE2-NEXT:    [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
; SSE2-NEXT:    [[TMP15:%.*]] = trunc i64 [[B_COERCE1]] to i16
; SSE2-NEXT:    [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
; SSE2-NEXT:    [[TMP16:%.*]] = and <2 x i64> [[TMP2]], splat (i64 255)
; SSE2-NEXT:    [[TMP17:%.*]] = and <2 x i64> [[TMP10]], splat (i64 255)
; SSE2-NEXT:    [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
; SSE2-NEXT:    [[TMP19:%.*]] = insertelement <2 x i16> [[TMP18]], i16 [[TMP7]], i64 1
; SSE2-NEXT:    [[TMP20:%.*]] = lshr <2 x i16> [[TMP19]], splat (i16 8)
; SSE2-NEXT:    [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 56
; SSE2-NEXT:    [[TMP21:%.*]] = insertelement <2 x i16> poison, i16 [[TMP8]], i64 0
; SSE2-NEXT:    [[TMP22:%.*]] = insertelement <2 x i16> [[TMP21]], i16 [[TMP15]], i64 1
; SSE2-NEXT:    [[TMP23:%.*]] = lshr <2 x i16> [[TMP22]], splat (i16 8)
; SSE2-NEXT:    [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 56
; SSE2-NEXT:    [[CONV1_6:%.*]] = and i64 [[A_SROA_7_0_EXTRACT_SHIFT]], 255
; SSE2-NEXT:    [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 56
; SSE2-NEXT:    [[CONV4_6:%.*]] = and i64 [[B_SROA_7_0_EXTRACT_SHIFT]], 255
; SSE2-NEXT:    [[TMP24:%.*]] = add nuw nsw <2 x i64> [[TMP16]], splat (i64 1)
; SSE2-NEXT:    [[TMP25:%.*]] = add nuw nsw <2 x i64> [[TMP24]], [[TMP17]]
; SSE2-NEXT:    [[TMP26:%.*]] = lshr <2 x i64> [[TMP25]], splat (i64 1)
; SSE2-NEXT:    [[TMP27:%.*]] = add nuw nsw <2 x i16> [[TMP20]], splat (i16 1)
; SSE2-NEXT:    [[TMP28:%.*]] = add nuw nsw <2 x i16> [[TMP27]], [[TMP23]]
; SSE2-NEXT:    [[TMP29:%.*]] = and <2 x i64> [[TMP3]], splat (i64 255)
; SSE2-NEXT:    [[TMP30:%.*]] = and <2 x i64> [[TMP11]], splat (i64 255)
; SSE2-NEXT:    [[TMP31:%.*]] = add nuw nsw <2 x i64> [[TMP29]], splat (i64 1)
; SSE2-NEXT:    [[TMP32:%.*]] = add nuw nsw <2 x i64> [[TMP31]], [[TMP30]]
; SSE2-NEXT:    [[TMP33:%.*]] = and <2 x i64> [[TMP4]], splat (i64 255)
; SSE2-NEXT:    [[TMP34:%.*]] = and <2 x i64> [[TMP12]], splat (i64 255)
; SSE2-NEXT:    [[TMP35:%.*]] = add nuw nsw <2 x i64> [[TMP33]], splat (i64 1)
; SSE2-NEXT:    [[TMP36:%.*]] = add nuw nsw <2 x i64> [[TMP35]], [[TMP34]]
; SSE2-NEXT:    [[TMP37:%.*]] = and <2 x i64> [[TMP5]], splat (i64 255)
; SSE2-NEXT:    [[TMP38:%.*]] = and <2 x i64> [[TMP13]], splat (i64 255)
; SSE2-NEXT:    [[TMP39:%.*]] = add nuw nsw <2 x i64> [[TMP37]], splat (i64 1)
; SSE2-NEXT:    [[TMP40:%.*]] = add nuw nsw <2 x i64> [[TMP39]], [[TMP38]]
; SSE2-NEXT:    [[TMP41:%.*]] = and <2 x i64> [[TMP6]], splat (i64 255)
; SSE2-NEXT:    [[TMP42:%.*]] = and <2 x i64> [[TMP14]], splat (i64 255)
; SSE2-NEXT:    [[TMP43:%.*]] = add nuw nsw <2 x i64> [[TMP41]], splat (i64 1)
; SSE2-NEXT:    [[TMP44:%.*]] = add nuw nsw <2 x i64> [[TMP43]], [[TMP42]]
; SSE2-NEXT:    [[CONV1_14:%.*]] = and i64 [[A_SROA_16_8_EXTRACT_SHIFT]], 255
; SSE2-NEXT:    [[CONV4_14:%.*]] = and i64 [[B_SROA_16_8_EXTRACT_SHIFT]], 255
; SSE2-NEXT:    [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_8_0_EXTRACT_SHIFT]], 1
; SSE2-NEXT:    [[ADD_14:%.*]] = add nuw nsw i64 [[CONV1_14]], 1
; SSE2-NEXT:    [[ADD5_14:%.*]] = add nuw nsw i64 [[ADD_14]], [[CONV4_14]]
; SSE2-NEXT:    [[ADD5_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_8_0_EXTRACT_SHIFT]]
; SSE2-NEXT:    [[ADD_15:%.*]] = add nuw nsw i64 [[A_SROA_17_8_EXTRACT_SHIFT]], 1
; SSE2-NEXT:    [[ADD_6:%.*]] = add nuw nsw i64 [[CONV1_6]], 1
; SSE2-NEXT:    [[ADD5_15:%.*]] = add nuw nsw i64 [[ADD_15]], [[B_SROA_17_8_EXTRACT_SHIFT]]
; SSE2-NEXT:    [[ADD5_6:%.*]] = add nuw nsw i64 [[ADD_6]], [[CONV4_6]]
; SSE2-NEXT:    [[TMP45:%.*]] = shl nuw i64 [[ADD5_15]], 55
; SSE2-NEXT:    [[TMP46:%.*]] = shl nuw nsw i64 [[ADD5_6]], 47
; SSE2-NEXT:    [[RETVAL_SROA_17_8_INSERT_EXT:%.*]] = and i64 [[TMP45]], -72057594037927936
; SSE2-NEXT:    [[RETVAL_SROA_7_0_INSERT_SHIFT:%.*]] = and i64 [[TMP46]], 71776119061217280
; SSE2-NEXT:    [[TMP47:%.*]] = shl nuw nsw i64 [[ADD5_14]], 47
; SSE2-NEXT:    [[TMP48:%.*]] = shl nuw i64 [[ADD5_7]], 55
; SSE2-NEXT:    [[RETVAL_SROA_16_8_INSERT_SHIFT:%.*]] = and i64 [[TMP47]], 71776119061217280
; SSE2-NEXT:    [[RETVAL_SROA_8_0_INSERT_EXT:%.*]] = and i64 [[TMP48]], -72057594037927936
; SSE2-NEXT:    [[RETVAL_SROA_16_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_17_8_INSERT_EXT]], [[RETVAL_SROA_16_8_INSERT_SHIFT]]
; SSE2-NEXT:    [[RETVAL_SROA_7_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_0_INSERT_EXT]], [[RETVAL_SROA_7_0_INSERT_SHIFT]]
; SSE2-NEXT:    [[TMP49:%.*]] = shl nuw nsw <2 x i64> [[TMP44]], splat (i64 39)
; SSE2-NEXT:    [[TMP50:%.*]] = and <2 x i64> [[TMP49]], splat (i64 280375465082880)
; SSE2-NEXT:    [[TMP51:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_7_0_INSERT_INSERT]], i64 0
; SSE2-NEXT:    [[TMP52:%.*]] = insertelement <2 x i64> [[TMP51]], i64 [[RETVAL_SROA_16_8_INSERT_INSERT]], i64 1
; SSE2-NEXT:    [[TMP53:%.*]] = or disjoint <2 x i64> [[TMP52]], [[TMP50]]
; SSE2-NEXT:    [[TMP54:%.*]] = shl nuw nsw <2 x i64> [[TMP40]], splat (i64 31)
; SSE2-NEXT:    [[TMP55:%.*]] = and <2 x i64> [[TMP54]], splat (i64 1095216660480)
; SSE2-NEXT:    [[TMP56:%.*]] = or disjoint <2 x i64> [[TMP53]], [[TMP55]]
; SSE2-NEXT:    [[TMP57:%.*]] = shl nuw nsw <2 x i64> [[TMP36]], splat (i64 23)
; SSE2-NEXT:    [[TMP58:%.*]] = and <2 x i64> [[TMP57]], splat (i64 4278190080)
; SSE2-NEXT:    [[TMP59:%.*]] = or disjoint <2 x i64> [[TMP56]], [[TMP58]]
; SSE2-NEXT:    [[TMP60:%.*]] = shl nuw nsw <2 x i64> [[TMP32]], splat (i64 15)
; SSE2-NEXT:    [[TMP61:%.*]] = and <2 x i64> [[TMP60]], splat (i64 16711680)
; SSE2-NEXT:    [[TMP62:%.*]] = shl nuw <2 x i16> [[TMP28]], splat (i16 7)
; SSE2-NEXT:    [[TMP63:%.*]] = or disjoint <2 x i64> [[TMP59]], [[TMP61]]
; SSE2-NEXT:    [[TMP64:%.*]] = and <2 x i16> [[TMP62]], splat (i16 -256)
; SSE2-NEXT:    [[TMP65:%.*]] = zext <2 x i16> [[TMP64]] to <2 x i64>
; SSE2-NEXT:    [[TMP66:%.*]] = or <2 x i64> [[TMP63]], [[TMP65]]
; SSE2-NEXT:    [[TMP67:%.*]] = or <2 x i64> [[TMP66]], [[TMP26]]
; SSE2-NEXT:    [[TMP68:%.*]] = extractelement <2 x i64> [[TMP67]], i64 0
; SSE2-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP68]], 0
; SSE2-NEXT:    [[TMP69:%.*]] = extractelement <2 x i64> [[TMP67]], i64 1
; SSE2-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP69]], 1
; SSE2-NEXT:    ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; SSE4-LABEL: @avgr_16_u8(
; SSE4-NEXT:  entry:
; SSE4-NEXT:    [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i16
; SSE4-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
; SSE4-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[A_COERCE1:%.*]], i64 1
; SSE4-NEXT:    [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 16)
; SSE4-NEXT:    [[TMP4:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 24)
; SSE4-NEXT:    [[TMP5:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 32)
; SSE4-NEXT:    [[TMP6:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 40)
; SSE4-NEXT:    [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
; SSE4-NEXT:    [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 56
; SSE4-NEXT:    [[TMP7:%.*]] = trunc i64 [[A_COERCE1]] to i16
; SSE4-NEXT:    [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
; SSE4-NEXT:    [[TMP8:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i16
; SSE4-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0]], i64 0
; SSE4-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[B_COERCE1:%.*]], i64 1
; SSE4-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 16)
; SSE4-NEXT:    [[TMP12:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 24)
; SSE4-NEXT:    [[TMP13:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 32)
; SSE4-NEXT:    [[TMP14:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 40)
; SSE4-NEXT:    [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
; SSE4-NEXT:    [[TMP15:%.*]] = trunc i64 [[B_COERCE1]] to i16
; SSE4-NEXT:    [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
; SSE4-NEXT:    [[TMP16:%.*]] = and <2 x i64> [[TMP2]], splat (i64 255)
; SSE4-NEXT:    [[TMP17:%.*]] = and <2 x i64> [[TMP10]], splat (i64 255)
; SSE4-NEXT:    [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
; SSE4-NEXT:    [[TMP19:%.*]] = insertelement <2 x i16> [[TMP18]], i16 [[TMP7]], i64 1
; SSE4-NEXT:    [[TMP20:%.*]] = lshr <2 x i16> [[TMP19]], splat (i16 8)
; SSE4-NEXT:    [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 56
; SSE4-NEXT:    [[TMP21:%.*]] = insertelement <2 x i16> poison, i16 [[TMP8]], i64 0
; SSE4-NEXT:    [[TMP22:%.*]] = insertelement <2 x i16> [[TMP21]], i16 [[TMP15]], i64 1
; SSE4-NEXT:    [[TMP23:%.*]] = lshr <2 x i16> [[TMP22]], splat (i16 8)
; SSE4-NEXT:    [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 56
; SSE4-NEXT:    [[CONV1_6:%.*]] = and i64 [[A_SROA_7_0_EXTRACT_SHIFT]], 255
; SSE4-NEXT:    [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 56
; SSE4-NEXT:    [[CONV4_6:%.*]] = and i64 [[B_SROA_7_0_EXTRACT_SHIFT]], 255
; SSE4-NEXT:    [[TMP24:%.*]] = add nuw nsw <2 x i64> [[TMP16]], splat (i64 1)
; SSE4-NEXT:    [[TMP25:%.*]] = add nuw nsw <2 x i64> [[TMP24]], [[TMP17]]
; SSE4-NEXT:    [[TMP26:%.*]] = lshr <2 x i64> [[TMP25]], splat (i64 1)
; SSE4-NEXT:    [[TMP27:%.*]] = add nuw nsw <2 x i16> [[TMP20]], splat (i16 1)
; SSE4-NEXT:    [[TMP28:%.*]] = add nuw nsw <2 x i16> [[TMP27]], [[TMP23]]
; SSE4-NEXT:    [[TMP29:%.*]] = and <2 x i64> [[TMP3]], splat (i64 255)
; SSE4-NEXT:    [[TMP30:%.*]] = and <2 x i64> [[TMP11]], splat (i64 255)
; SSE4-NEXT:    [[TMP31:%.*]] = add nuw nsw <2 x i64> [[TMP29]], splat (i64 1)
; SSE4-NEXT:    [[TMP32:%.*]] = add nuw nsw <2 x i64> [[TMP31]], [[TMP30]]
; SSE4-NEXT:    [[TMP33:%.*]] = and <2 x i64> [[TMP4]], splat (i64 255)
; SSE4-NEXT:    [[TMP34:%.*]] = and <2 x i64> [[TMP12]], splat (i64 255)
; SSE4-NEXT:    [[TMP35:%.*]] = add nuw nsw <2 x i64> [[TMP33]], splat (i64 1)
; SSE4-NEXT:    [[TMP36:%.*]] = add nuw nsw <2 x i64> [[TMP35]], [[TMP34]]
; SSE4-NEXT:    [[TMP37:%.*]] = and <2 x i64> [[TMP5]], splat (i64 255)
; SSE4-NEXT:    [[TMP38:%.*]] = and <2 x i64> [[TMP13]], splat (i64 255)
; SSE4-NEXT:    [[TMP39:%.*]] = add nuw nsw <2 x i64> [[TMP37]], splat (i64 1)
; SSE4-NEXT:    [[TMP40:%.*]] = add nuw nsw <2 x i64> [[TMP39]], [[TMP38]]
; SSE4-NEXT:    [[CONV1_14:%.*]] = and i64 [[A_SROA_16_8_EXTRACT_SHIFT]], 255
; SSE4-NEXT:    [[CONV4_14:%.*]] = and i64 [[B_SROA_16_8_EXTRACT_SHIFT]], 255
; SSE4-NEXT:    [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_8_0_EXTRACT_SHIFT]], 1
; SSE4-NEXT:    [[ADD_14:%.*]] = add nuw nsw i64 [[CONV1_14]], 1
; SSE4-NEXT:    [[ADD5_14:%.*]] = add nuw nsw i64 [[ADD_14]], [[CONV4_14]]
; SSE4-NEXT:    [[ADD5_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_8_0_EXTRACT_SHIFT]]
; SSE4-NEXT:    [[ADD_15:%.*]] = add nuw nsw i64 [[A_SROA_17_8_EXTRACT_SHIFT]], 1
; SSE4-NEXT:    [[ADD_6:%.*]] = add nuw nsw i64 [[CONV1_6]], 1
; SSE4-NEXT:    [[ADD5_15:%.*]] = add nuw nsw i64 [[ADD_15]], [[B_SROA_17_8_EXTRACT_SHIFT]]
; SSE4-NEXT:    [[ADD5_6:%.*]] = add nuw nsw i64 [[ADD_6]], [[CONV4_6]]
; SSE4-NEXT:    [[TMP45:%.*]] = shl nuw i64 [[ADD5_15]], 55
; SSE4-NEXT:    [[TMP46:%.*]] = shl nuw nsw i64 [[ADD5_6]], 47
; SSE4-NEXT:    [[RETVAL_SROA_17_8_INSERT_EXT:%.*]] = and i64 [[TMP45]], -72057594037927936
; SSE4-NEXT:    [[RETVAL_SROA_7_0_INSERT_SHIFT:%.*]] = and i64 [[TMP46]], 71776119061217280
; SSE4-NEXT:    [[TMP47:%.*]] = shl nuw nsw i64 [[ADD5_14]], 47
; SSE4-NEXT:    [[TMP48:%.*]] = shl nuw i64 [[ADD5_7]], 55
; SSE4-NEXT:    [[RETVAL_SROA_16_8_INSERT_SHIFT:%.*]] = and i64 [[TMP47]], 71776119061217280
; SSE4-NEXT:    [[RETVAL_SROA_8_0_INSERT_EXT:%.*]] = and i64 [[TMP48]], -72057594037927936
; SSE4-NEXT:    [[RETVAL_SROA_16_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_17_8_INSERT_EXT]], [[RETVAL_SROA_16_8_INSERT_SHIFT]]
; SSE4-NEXT:    [[RETVAL_SROA_7_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_0_INSERT_EXT]], [[RETVAL_SROA_7_0_INSERT_SHIFT]]
; SSE4-NEXT:    [[TMP49:%.*]] = shl nuw nsw <2 x i64> [[TMP44]], splat (i64 39)
; SSE4-NEXT:    [[TMP50:%.*]] = and <2 x i64> [[TMP49]], splat (i64 280375465082880)
; SSE4-NEXT:    [[TMP51:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_7_0_INSERT_INSERT]], i64 0
; SSE4-NEXT:    [[TMP52:%.*]] = insertelement <2 x i64> [[TMP51]], i64 [[RETVAL_SROA_16_8_INSERT_INSERT]], i64 1
; SSE4-NEXT:    [[TMP53:%.*]] = or disjoint <2 x i64> [[TMP52]], [[TMP50]]
; SSE4-NEXT:    [[TMP54:%.*]] = shl nuw nsw <2 x i64> [[TMP40]], splat (i64 31)
; SSE4-NEXT:    [[TMP55:%.*]] = and <2 x i64> [[TMP54]], splat (i64 1095216660480)
; SSE4-NEXT:    [[TMP56:%.*]] = or disjoint <2 x i64> [[TMP53]], [[TMP55]]
; SSE4-NEXT:    [[TMP57:%.*]] = shl nuw nsw <2 x i64> [[TMP36]], splat (i64 23)
; SSE4-NEXT:    [[TMP58:%.*]] = and <2 x i64> [[TMP57]], splat (i64 4278190080)
; SSE4-NEXT:    [[TMP59:%.*]] = or disjoint <2 x i64> [[TMP56]], [[TMP58]]
; SSE4-NEXT:    [[TMP60:%.*]] = shl nuw nsw <2 x i64> [[TMP32]], splat (i64 15)
; SSE4-NEXT:    [[TMP61:%.*]] = and <2 x i64> [[TMP60]], splat (i64 16711680)
; SSE4-NEXT:    [[TMP62:%.*]] = shl nuw <2 x i16> [[TMP28]], splat (i16 7)
; SSE4-NEXT:    [[TMP63:%.*]] = or disjoint <2 x i64> [[TMP59]], [[TMP61]]
; SSE4-NEXT:    [[TMP64:%.*]] = and <2 x i16> [[TMP62]], splat (i16 -256)
; SSE4-NEXT:    [[TMP65:%.*]] = zext <2 x i16> [[TMP64]] to <2 x i64>
; SSE4-NEXT:    [[TMP66:%.*]] = or <2 x i64> [[TMP63]], [[TMP65]]
; SSE4-NEXT:    [[TMP67:%.*]] = or <2 x i64> [[TMP66]], [[TMP26]]
; SSE4-NEXT:    [[TMP68:%.*]] = extractelement <2 x i64> [[TMP67]], i64 0
; SSE4-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP68]], 0
; SSE4-NEXT:    [[TMP69:%.*]] = extractelement <2 x i64> [[TMP67]], i64 1
; SSE4-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP69]], 1
; SSE4-NEXT:    ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; AVX-LABEL: @avgr_16_u8(
; AVX-NEXT:  entry:
; AVX-NEXT:    [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i16
; AVX-NEXT:    [[TMP1:%.*]] = lshr i16 [[TMP0]], 8
; AVX-NEXT:    [[TMP2:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i16
; AVX-NEXT:    [[TMP3:%.*]] = lshr i16 [[TMP2]], 8
; AVX-NEXT:    [[TMP4:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i16
; AVX-NEXT:    [[TMP5:%.*]] = lshr i16 [[TMP4]], 8
; AVX-NEXT:    [[TMP6:%.*]] = trunc i64 [[B_COERCE1:%.*]] to i16
; AVX-NEXT:    [[TMP7:%.*]] = lshr i16 [[TMP6]], 8
; AVX-NEXT:    [[CONV1:%.*]] = and i64 [[A_COERCE0]], 255
; AVX-NEXT:    [[CONV4:%.*]] = and i64 [[B_COERCE0]], 255
; AVX-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[CONV1]], 1
; AVX-NEXT:    [[ADD5:%.*]] = add nuw nsw i64 [[ADD]], [[CONV4]]
; AVX-NEXT:    [[ADD_1:%.*]] = add nuw nsw i16 [[TMP1]], 1
; AVX-NEXT:    [[ADD5_1:%.*]] = add nuw nsw i16 [[ADD_1]], [[TMP5]]
; AVX-NEXT:    [[CONV1_8:%.*]] = and i64 [[A_COERCE1]], 255
; AVX-NEXT:    [[CONV4_8:%.*]] = and i64 [[B_COERCE1]], 255
; AVX-NEXT:    [[ADD_8:%.*]] = add nuw nsw i64 [[CONV1_8]], 1
; AVX-NEXT:    [[ADD5_8:%.*]] = add nuw nsw i64 [[ADD_8]], [[CONV4_8]]
; AVX-NEXT:    [[ADD_9:%.*]] = add nuw nsw i16 [[TMP3]], 1
; AVX-NEXT:    [[ADD5_9:%.*]] = add nuw nsw i16 [[ADD_9]], [[TMP7]]
; AVX-NEXT:    [[TMP8:%.*]] = shl nuw i16 [[ADD5_1]], 7
; AVX-NEXT:    [[TMP9:%.*]] = and i16 [[TMP8]], -256
; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 -1, i64 -1>, i64 [[A_COERCE0]], i64 0
; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
; AVX-NEXT:    [[TMP12:%.*]] = lshr <8 x i64> [[TMP11]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 0, i64 0>
; AVX-NEXT:    [[TMP13:%.*]] = and <8 x i64> [[TMP12]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 0, i64 0>
; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_SHIFT_MASKED:%.*]] = zext i16 [[TMP9]] to i64
; AVX-NEXT:    [[TMP14:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE0]], i64 0
; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x i64> [[TMP14]], i64 [[ADD5]], i64 6
; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x i64> [[TMP15]], i64 [[RETVAL_SROA_2_0_INSERT_SHIFT_MASKED]], i64 7
; AVX-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP16]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
; AVX-NEXT:    [[TMP18:%.*]] = lshr <8 x i64> [[TMP17]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 1, i64 0>
; AVX-NEXT:    [[TMP19:%.*]] = and <8 x i64> [[TMP18]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 -1, i64 -1>
; AVX-NEXT:    [[TMP20:%.*]] = add nuw nsw <8 x i64> [[TMP13]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 0, i64 0>
; AVX-NEXT:    [[TMP21:%.*]] = add nuw nsw <8 x i64> [[TMP19]], [[TMP20]]
; AVX-NEXT:    [[TMP22:%.*]] = shl nuw <8 x i64> [[TMP21]], <i64 55, i64 47, i64 39, i64 31, i64 23, i64 15, i64 0, i64 0>
; AVX-NEXT:    [[TMP23:%.*]] = and <8 x i64> [[TMP22]], <i64 -72057594037927936, i64 71776119061217280, i64 280375465082880, i64 1095216660480, i64 4278190080, i64 16711680, i64 -1, i64 -1>
; AVX-NEXT:    [[TMP24:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP23]])
; AVX-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0
; AVX-NEXT:    [[TMP25:%.*]] = shl nuw i16 [[ADD5_9]], 7
; AVX-NEXT:    [[TMP26:%.*]] = and i16 [[TMP25]], -256
; AVX-NEXT:    [[TMP27:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 -1, i64 -1>, i64 [[A_COERCE1]], i64 0
; AVX-NEXT:    [[TMP28:%.*]] = shufflevector <8 x i64> [[TMP27]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
; AVX-NEXT:    [[TMP29:%.*]] = lshr <8 x i64> [[TMP28]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 0, i64 0>
; AVX-NEXT:    [[TMP30:%.*]] = and <8 x i64> [[TMP29]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 0, i64 0>
; AVX-NEXT:    [[RETVAL_SROA_11_8_INSERT_SHIFT_MASKED:%.*]] = zext i16 [[TMP26]] to i64
; AVX-NEXT:    [[TMP31:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE1]], i64 0
; AVX-NEXT:    [[TMP32:%.*]] = insertelement <8 x i64> [[TMP31]], i64 [[ADD5_8]], i64 6
; AVX-NEXT:    [[TMP33:%.*]] = insertelement <8 x i64> [[TMP32]], i64 [[RETVAL_SROA_11_8_INSERT_SHIFT_MASKED]], i64 7
; AVX-NEXT:    [[TMP34:%.*]] = shufflevector <8 x i64> [[TMP33]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
; AVX-NEXT:    [[TMP35:%.*]] = lshr <8 x i64> [[TMP34]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 1, i64 0>
; AVX-NEXT:    [[TMP36:%.*]] = and <8 x i64> [[TMP35]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 -1, i64 -1>
; AVX-NEXT:    [[TMP37:%.*]] = add nuw nsw <8 x i64> [[TMP30]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 0, i64 0>
; AVX-NEXT:    [[TMP38:%.*]] = add nuw nsw <8 x i64> [[TMP36]], [[TMP37]]
; AVX-NEXT:    [[TMP39:%.*]] = shl nuw <8 x i64> [[TMP38]], <i64 55, i64 47, i64 39, i64 31, i64 23, i64 15, i64 0, i64 0>
; AVX-NEXT:    [[TMP40:%.*]] = and <8 x i64> [[TMP39]], <i64 -72057594037927936, i64 71776119061217280, i64 280375465082880, i64 1095216660480, i64 4278190080, i64 16711680, i64 -1, i64 -1>
; AVX-NEXT:    [[TMP41:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP40]])
; AVX-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP41]], 1
; AVX-NEXT:    ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; Input IR: a straightforward scalar loop computing the element-wise rounded
; average of two 16-byte arrays passed (and returned) as { i64, i64 } pairs:
;   retval[i] = (zext(a[i]) + zext(b[i]) + 1) >> 1   for i in [0, 16)
; The arithmetic is widened to i16 so the +1 rounding bias cannot overflow i8.
; The CHECK lines above (autogenerated by update_test_checks.py) record how
; each -mcpu level optimizes this at -O3; ideally this pattern would be
; recognized as an unsigned averaging operation (see the PR reference at the
; top of the file) rather than the shift/mask expansions currently emitted.
entry:
  ; Spill the coerced i64 argument pairs into byte-addressable stack arrays.
  %retval = alloca %"struct.std::array16", align 1
  %a = alloca %"struct.std::array16", align 1
  %b = alloca %"struct.std::array16", align 1
  store i64 %a.coerce0, ptr %a, align 1
  %0 = getelementptr inbounds nuw i8, ptr %a, i64 8
  store i64 %a.coerce1, ptr %0, align 1
  store i64 %b.coerce0, ptr %b, align 1
  %1 = getelementptr inbounds nuw i8, ptr %b, i64 8
  store i64 %b.coerce1, ptr %1, align 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  ; Iterate over all 16 bytes of the arrays.
  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %cmp = icmp samesign ult i64 %i.0, 16
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond
  ; Reload the result array as two i64 halves and rebuild the { i64, i64 }
  ; aggregate return value.
  %.fca.0.load = load i64, ptr %retval, align 1
  %.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0
  %.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8
  %.fca.1.load = load i64, ptr %.fca.1.gep, align 1
  %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1
  ret { i64, i64 } %.fca.1.insert

for.body:                                         ; preds = %for.cond
  ; retval[i] = (a[i] + b[i] + 1) >> 1, computed in i16 so the sum plus the
  ; rounding bias cannot wrap before the truncating store back to i8.
  %arrayidx.i = getelementptr inbounds nuw i8, ptr %a, i64 %i.0
  %2 = load i8, ptr %arrayidx.i, align 1
  %conv1 = zext i8 %2 to i16
  %arrayidx.i12 = getelementptr inbounds nuw i8, ptr %b, i64 %i.0
  %3 = load i8, ptr %arrayidx.i12, align 1
  %conv4 = zext i8 %3 to i16
  %add = add nuw nsw i16 %conv1, %conv4
  %add5 = add nuw nsw i16 %add, 1
  %shr = lshr i16 %add5, 1
  %conv6 = trunc i16 %shr to i8
  %arrayidx.i13 = getelementptr inbounds nuw i8, ptr %retval, i64 %i.0
  store i8 %conv6, ptr %arrayidx.i13, align 1
  %inc = add nuw nsw i64 %i.0, 1
  br label %for.cond
}
| |
| define { i64, i64 } @avgr_16_u8_alt(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) { |
| ; SSE2-LABEL: @avgr_16_u8_alt( |
| ; SSE2-NEXT: entry: |
| ; SSE2-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i8 |
| ; SSE2-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 8 |
| ; SSE2-NEXT: [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16 |
| ; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 24 |
| ; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32 |
| ; SSE2-NEXT: [[A_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_5_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 40 |
| ; SSE2-NEXT: [[A_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_6_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48 |
| ; SSE2-NEXT: [[A_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_7_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 56 |
| ; SSE2-NEXT: [[A_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_8_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i8 |
| ; SSE2-NEXT: [[A_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 8 |
| ; SSE2-NEXT: [[A_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_11_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 16 |
| ; SSE2-NEXT: [[A_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_12_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 24 |
| ; SSE2-NEXT: [[A_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_13_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32 |
| ; SSE2-NEXT: [[A_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_14_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 40 |
| ; SSE2-NEXT: [[A_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_15_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48 |
| ; SSE2-NEXT: [[A_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_16_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 56 |
| ; SSE2-NEXT: [[A_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_17_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i8 |
| ; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 8 |
| ; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16 |
| ; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 24 |
| ; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32 |
| ; SSE2-NEXT: [[B_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_5_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 40 |
| ; SSE2-NEXT: [[B_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_6_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48 |
| ; SSE2-NEXT: [[B_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_7_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 56 |
| ; SSE2-NEXT: [[B_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_8_0_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE1:%.*]] to i8 |
| ; SSE2-NEXT: [[B_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 8 |
| ; SSE2-NEXT: [[B_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_11_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16 |
| ; SSE2-NEXT: [[B_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_12_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 24 |
| ; SSE2-NEXT: [[B_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_13_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32 |
| ; SSE2-NEXT: [[B_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_14_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 40 |
| ; SSE2-NEXT: [[B_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_15_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48 |
| ; SSE2-NEXT: [[B_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_16_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 56 |
| ; SSE2-NEXT: [[B_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_17_8_EXTRACT_SHIFT]] to i8 |
| ; SSE2-NEXT: [[SHR:%.*]] = lshr i8 [[A_SROA_0_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5:%.*]] = lshr i8 [[B_SROA_0_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW:%.*]] = add nuw i8 [[SHR5]], [[SHR]] |
| ; SSE2-NEXT: [[OR21:%.*]] = or i8 [[B_SROA_0_0_EXTRACT_TRUNC]], [[A_SROA_0_0_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP0:%.*]] = and i8 [[OR21]], 1 |
| ; SSE2-NEXT: [[ADD12:%.*]] = add nuw i8 [[NARROW]], [[TMP0]] |
| ; SSE2-NEXT: [[SHR_1:%.*]] = lshr i8 [[A_SROA_2_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_1:%.*]] = lshr i8 [[B_SROA_2_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_1:%.*]] = add nuw i8 [[SHR5_1]], [[SHR_1]] |
| ; SSE2-NEXT: [[OR21_1:%.*]] = or i8 [[B_SROA_2_0_EXTRACT_TRUNC]], [[A_SROA_2_0_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP1:%.*]] = and i8 [[OR21_1]], 1 |
| ; SSE2-NEXT: [[ADD12_1:%.*]] = add nuw i8 [[NARROW_1]], [[TMP1]] |
| ; SSE2-NEXT: [[SHR_2:%.*]] = lshr i8 [[A_SROA_3_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_2:%.*]] = lshr i8 [[B_SROA_3_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_2:%.*]] = add nuw i8 [[SHR5_2]], [[SHR_2]] |
| ; SSE2-NEXT: [[OR21_2:%.*]] = or i8 [[B_SROA_3_0_EXTRACT_TRUNC]], [[A_SROA_3_0_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP2:%.*]] = and i8 [[OR21_2]], 1 |
| ; SSE2-NEXT: [[ADD12_2:%.*]] = add nuw i8 [[NARROW_2]], [[TMP2]] |
| ; SSE2-NEXT: [[SHR_3:%.*]] = lshr i8 [[A_SROA_4_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_3:%.*]] = lshr i8 [[B_SROA_4_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_3:%.*]] = add nuw i8 [[SHR5_3]], [[SHR_3]] |
| ; SSE2-NEXT: [[OR21_3:%.*]] = or i8 [[B_SROA_4_0_EXTRACT_TRUNC]], [[A_SROA_4_0_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP3:%.*]] = and i8 [[OR21_3]], 1 |
| ; SSE2-NEXT: [[ADD12_3:%.*]] = add nuw i8 [[NARROW_3]], [[TMP3]] |
| ; SSE2-NEXT: [[SHR_4:%.*]] = lshr i8 [[A_SROA_5_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_4:%.*]] = lshr i8 [[B_SROA_5_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_4:%.*]] = add nuw i8 [[SHR5_4]], [[SHR_4]] |
| ; SSE2-NEXT: [[OR21_4:%.*]] = or i8 [[B_SROA_5_0_EXTRACT_TRUNC]], [[A_SROA_5_0_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP4:%.*]] = and i8 [[OR21_4]], 1 |
| ; SSE2-NEXT: [[ADD12_4:%.*]] = add nuw i8 [[NARROW_4]], [[TMP4]] |
| ; SSE2-NEXT: [[SHR_5:%.*]] = lshr i8 [[A_SROA_6_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_5:%.*]] = lshr i8 [[B_SROA_6_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_5:%.*]] = add nuw i8 [[SHR5_5]], [[SHR_5]] |
| ; SSE2-NEXT: [[OR21_5:%.*]] = or i8 [[B_SROA_6_0_EXTRACT_TRUNC]], [[A_SROA_6_0_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP5:%.*]] = and i8 [[OR21_5]], 1 |
| ; SSE2-NEXT: [[ADD12_5:%.*]] = add nuw i8 [[NARROW_5]], [[TMP5]] |
| ; SSE2-NEXT: [[SHR_6:%.*]] = lshr i8 [[A_SROA_7_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_6:%.*]] = lshr i8 [[B_SROA_7_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_6:%.*]] = add nuw i8 [[SHR5_6]], [[SHR_6]] |
| ; SSE2-NEXT: [[OR21_6:%.*]] = or i8 [[B_SROA_7_0_EXTRACT_TRUNC]], [[A_SROA_7_0_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP6:%.*]] = and i8 [[OR21_6]], 1 |
| ; SSE2-NEXT: [[ADD12_6:%.*]] = add nuw i8 [[NARROW_6]], [[TMP6]] |
| ; SSE2-NEXT: [[SHR_7:%.*]] = lshr i8 [[A_SROA_8_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_7:%.*]] = lshr i8 [[B_SROA_8_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_7:%.*]] = add nuw i8 [[SHR5_7]], [[SHR_7]] |
| ; SSE2-NEXT: [[OR21_7:%.*]] = or i8 [[B_SROA_8_0_EXTRACT_TRUNC]], [[A_SROA_8_0_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP7:%.*]] = and i8 [[OR21_7]], 1 |
| ; SSE2-NEXT: [[ADD12_7:%.*]] = add nuw i8 [[NARROW_7]], [[TMP7]] |
| ; SSE2-NEXT: [[SHR_8:%.*]] = lshr i8 [[A_SROA_9_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_8:%.*]] = lshr i8 [[B_SROA_9_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_8:%.*]] = add nuw i8 [[SHR5_8]], [[SHR_8]] |
| ; SSE2-NEXT: [[OR21_8:%.*]] = or i8 [[B_SROA_9_8_EXTRACT_TRUNC]], [[A_SROA_9_8_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP8:%.*]] = and i8 [[OR21_8]], 1 |
| ; SSE2-NEXT: [[ADD12_8:%.*]] = add nuw i8 [[NARROW_8]], [[TMP8]] |
| ; SSE2-NEXT: [[SHR_9:%.*]] = lshr i8 [[A_SROA_11_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_9:%.*]] = lshr i8 [[B_SROA_11_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_9:%.*]] = add nuw i8 [[SHR5_9]], [[SHR_9]] |
| ; SSE2-NEXT: [[OR21_9:%.*]] = or i8 [[B_SROA_11_8_EXTRACT_TRUNC]], [[A_SROA_11_8_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP9:%.*]] = and i8 [[OR21_9]], 1 |
| ; SSE2-NEXT: [[ADD12_9:%.*]] = add nuw i8 [[NARROW_9]], [[TMP9]] |
| ; SSE2-NEXT: [[SHR_10:%.*]] = lshr i8 [[A_SROA_12_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_10:%.*]] = lshr i8 [[B_SROA_12_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_10:%.*]] = add nuw i8 [[SHR5_10]], [[SHR_10]] |
| ; SSE2-NEXT: [[OR21_10:%.*]] = or i8 [[B_SROA_12_8_EXTRACT_TRUNC]], [[A_SROA_12_8_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP10:%.*]] = and i8 [[OR21_10]], 1 |
| ; SSE2-NEXT: [[ADD12_10:%.*]] = add nuw i8 [[NARROW_10]], [[TMP10]] |
| ; SSE2-NEXT: [[SHR_11:%.*]] = lshr i8 [[A_SROA_13_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_11:%.*]] = lshr i8 [[B_SROA_13_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_11:%.*]] = add nuw i8 [[SHR5_11]], [[SHR_11]] |
| ; SSE2-NEXT: [[OR21_11:%.*]] = or i8 [[B_SROA_13_8_EXTRACT_TRUNC]], [[A_SROA_13_8_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP11:%.*]] = and i8 [[OR21_11]], 1 |
| ; SSE2-NEXT: [[ADD12_11:%.*]] = add nuw i8 [[NARROW_11]], [[TMP11]] |
| ; SSE2-NEXT: [[SHR_12:%.*]] = lshr i8 [[A_SROA_14_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_12:%.*]] = lshr i8 [[B_SROA_14_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_12:%.*]] = add nuw i8 [[SHR5_12]], [[SHR_12]] |
| ; SSE2-NEXT: [[OR21_12:%.*]] = or i8 [[B_SROA_14_8_EXTRACT_TRUNC]], [[A_SROA_14_8_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP12:%.*]] = and i8 [[OR21_12]], 1 |
| ; SSE2-NEXT: [[ADD12_12:%.*]] = add nuw i8 [[NARROW_12]], [[TMP12]] |
| ; SSE2-NEXT: [[SHR_13:%.*]] = lshr i8 [[A_SROA_15_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_13:%.*]] = lshr i8 [[B_SROA_15_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_13:%.*]] = add nuw i8 [[SHR5_13]], [[SHR_13]] |
| ; SSE2-NEXT: [[OR21_13:%.*]] = or i8 [[B_SROA_15_8_EXTRACT_TRUNC]], [[A_SROA_15_8_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP13:%.*]] = and i8 [[OR21_13]], 1 |
| ; SSE2-NEXT: [[ADD12_13:%.*]] = add nuw i8 [[NARROW_13]], [[TMP13]] |
| ; SSE2-NEXT: [[SHR_14:%.*]] = lshr i8 [[A_SROA_16_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_14:%.*]] = lshr i8 [[B_SROA_16_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_14:%.*]] = add nuw i8 [[SHR5_14]], [[SHR_14]] |
| ; SSE2-NEXT: [[OR21_14:%.*]] = or i8 [[B_SROA_16_8_EXTRACT_TRUNC]], [[A_SROA_16_8_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP14:%.*]] = and i8 [[OR21_14]], 1 |
| ; SSE2-NEXT: [[ADD12_14:%.*]] = add nuw i8 [[NARROW_14]], [[TMP14]] |
| ; SSE2-NEXT: [[SHR_15:%.*]] = lshr i8 [[A_SROA_17_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_15:%.*]] = lshr i8 [[B_SROA_17_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_15:%.*]] = add nuw i8 [[SHR5_15]], [[SHR_15]] |
| ; SSE2-NEXT: [[OR21_15:%.*]] = or i8 [[B_SROA_17_8_EXTRACT_TRUNC]], [[A_SROA_17_8_EXTRACT_TRUNC]] |
| ; SSE2-NEXT: [[TMP15:%.*]] = and i8 [[OR21_15]], 1 |
| ; SSE2-NEXT: [[ADD12_15:%.*]] = add nuw i8 [[NARROW_15]], [[TMP15]] |
| ; SSE2-NEXT: [[RETVAL_SROA_8_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_7]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_8_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_8_0_INSERT_EXT]], 56 |
| ; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_6]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_0_INSERT_EXT]], 48 |
| ; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_0_INSERT_SHIFT]], [[RETVAL_SROA_7_0_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_6_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_5]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_6_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_6_0_INSERT_EXT]], 40 |
| ; SSE2-NEXT: [[RETVAL_SROA_6_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_7_0_INSERT_INSERT]], [[RETVAL_SROA_6_0_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_5_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_4]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_5_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_5_0_INSERT_EXT]], 32 |
| ; SSE2-NEXT: [[RETVAL_SROA_5_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_6_0_INSERT_INSERT]], [[RETVAL_SROA_5_0_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_3]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 24 |
| ; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_5_0_INSERT_INSERT]], [[RETVAL_SROA_4_0_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_2]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 16 |
| ; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_1]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 8 |
| ; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_MASK:%.*]] = or disjoint i64 [[RETVAL_SROA_4_0_INSERT_INSERT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i8 [[ADD12]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_MASK:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_MASK]], [[RETVAL_SROA_2_0_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_0_0_INSERT_MASK]], [[RETVAL_SROA_0_0_INSERT_EXT]] |
| ; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0 |
| ; SSE2-NEXT: [[RETVAL_SROA_17_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_15]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_17_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_17_8_INSERT_EXT]], 56 |
| ; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_14]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_16_8_INSERT_EXT]], 48 |
| ; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_17_8_INSERT_SHIFT]], [[RETVAL_SROA_16_8_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_15_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_13]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_15_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_15_8_INSERT_EXT]], 40 |
| ; SSE2-NEXT: [[RETVAL_SROA_15_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_16_8_INSERT_INSERT]], [[RETVAL_SROA_15_8_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_14_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_12]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_14_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_14_8_INSERT_EXT]], 32 |
| ; SSE2-NEXT: [[RETVAL_SROA_14_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_15_8_INSERT_INSERT]], [[RETVAL_SROA_14_8_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_13_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_11]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_13_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_13_8_INSERT_EXT]], 24 |
| ; SSE2-NEXT: [[RETVAL_SROA_13_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_14_8_INSERT_INSERT]], [[RETVAL_SROA_13_8_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_12_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_10]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_12_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_12_8_INSERT_EXT]], 16 |
| ; SSE2-NEXT: [[RETVAL_SROA_11_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_9]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_11_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_11_8_INSERT_EXT]], 8 |
| ; SSE2-NEXT: [[RETVAL_SROA_11_8_INSERT_MASK:%.*]] = or disjoint i64 [[RETVAL_SROA_13_8_INSERT_INSERT]], [[RETVAL_SROA_12_8_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_8]] to i64 |
| ; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_MASK:%.*]] = or i64 [[RETVAL_SROA_11_8_INSERT_MASK]], [[RETVAL_SROA_11_8_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_9_8_INSERT_MASK]], [[RETVAL_SROA_9_8_INSERT_EXT]] |
| ; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_9_8_INSERT_INSERT]], 1 |
| ; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] |
| ; |
| ; SSE4-LABEL: @avgr_16_u8( |
| ; SSE4-NEXT: entry: |
| ; SSE4-NEXT: [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 56 |
| ; SSE4-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48 |
| ; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 40 |
| ; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32 |
| ; SSE4-NEXT: [[A_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 24 |
| ; SSE4-NEXT: [[A_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16 |
| ; SSE4-NEXT: [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 8 |
| ; SSE4-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_8_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_5_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_6_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_7_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE0]] to i8 |
| ; SSE4-NEXT: [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 56 |
| ; SSE4-NEXT: [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48 |
| ; SSE4-NEXT: [[B_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 40 |
| ; SSE4-NEXT: [[B_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32 |
| ; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 24 |
| ; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16 |
| ; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 8 |
| ; SSE4-NEXT: [[B_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_8_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_7_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_6_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_5_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE0]] to i8 |
| ; SSE4-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A_SROA_8_0_EXTRACT_TRUNC]], i64 0 |
| ; SSE4-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> [[TMP0]], i8 [[A_SROA_7_0_EXTRACT_TRUNC]], i64 1 |
| ; SSE4-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> [[TMP1]], i8 [[A_SROA_6_0_EXTRACT_TRUNC]], i64 2 |
| ; SSE4-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[A_SROA_5_0_EXTRACT_TRUNC]], i64 3 |
| ; SSE4-NEXT: [[TMP4:%.*]] = insertelement <8 x i8> [[TMP3]], i8 [[A_SROA_4_0_EXTRACT_TRUNC]], i64 4 |
| ; SSE4-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP4]], i8 [[A_SROA_3_0_EXTRACT_TRUNC]], i64 5 |
| ; SSE4-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[A_SROA_2_0_EXTRACT_TRUNC]], i64 6 |
| ; SSE4-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[TMP6]], i8 [[A_SROA_0_0_EXTRACT_TRUNC]], i64 7 |
| ; SSE4-NEXT: [[TMP8:%.*]] = lshr <8 x i8> [[TMP7]], splat (i8 1) |
| ; SSE4-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> poison, i8 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 0 |
| ; SSE4-NEXT: [[TMP10:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[B_SROA_7_0_EXTRACT_TRUNC]], i64 1 |
| ; SSE4-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP10]], i8 [[B_SROA_6_0_EXTRACT_TRUNC]], i64 2 |
| ; SSE4-NEXT: [[TMP12:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[B_SROA_5_0_EXTRACT_TRUNC]], i64 3 |
| ; SSE4-NEXT: [[TMP13:%.*]] = insertelement <8 x i8> [[TMP12]], i8 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 4 |
| ; SSE4-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP13]], i8 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 5 |
| ; SSE4-NEXT: [[TMP15:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 6 |
| ; SSE4-NEXT: [[TMP16:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[B_SROA_8_0_EXTRACT_TRUNC]], i64 7 |
| ; SSE4-NEXT: [[TMP17:%.*]] = lshr <8 x i8> [[TMP16]], splat (i8 1) |
| ; SSE4-NEXT: [[TMP18:%.*]] = add nuw <8 x i8> [[TMP17]], [[TMP8]] |
| ; SSE4-NEXT: [[TMP19:%.*]] = or <8 x i8> [[TMP16]], [[TMP7]] |
| ; SSE4-NEXT: [[TMP20:%.*]] = and <8 x i8> [[TMP19]], splat (i8 1) |
| ; SSE4-NEXT: [[TMP21:%.*]] = add nuw <8 x i8> [[TMP18]], [[TMP20]] |
| ; SSE4-NEXT: [[TMP22:%.*]] = zext <8 x i8> [[TMP21]] to <8 x i64> |
| ; SSE4-NEXT: [[TMP23:%.*]] = shl nuw <8 x i64> [[TMP22]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56> |
| ; SSE4-NEXT: [[TMP24:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP23]]) |
| ; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0 |
| ; SSE4-NEXT: [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1:%.*]], 56 |
| ; SSE4-NEXT: [[A_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48 |
| ; SSE4-NEXT: [[A_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 40 |
| ; SSE4-NEXT: [[A_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32 |
| ; SSE4-NEXT: [[A_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 24 |
| ; SSE4-NEXT: [[A_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 16 |
| ; SSE4-NEXT: [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 8 |
| ; SSE4-NEXT: [[A_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_17_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_11_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_12_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_13_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_14_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_15_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_16_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[A_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE1]] to i8 |
| ; SSE4-NEXT: [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 56 |
| ; SSE4-NEXT: [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48 |
| ; SSE4-NEXT: [[B_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 40 |
| ; SSE4-NEXT: [[B_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32 |
| ; SSE4-NEXT: [[B_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 24 |
| ; SSE4-NEXT: [[B_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16 |
| ; SSE4-NEXT: [[B_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 8 |
| ; SSE4-NEXT: [[B_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_17_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_16_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_15_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_14_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_13_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_12_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_11_8_EXTRACT_SHIFT]] to i8 |
| ; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE1]] to i8 |
| ; SSE4-NEXT: [[TMP25:%.*]] = insertelement <8 x i8> poison, i8 [[A_SROA_17_8_EXTRACT_TRUNC]], i64 0 |
| ; SSE4-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[A_SROA_16_8_EXTRACT_TRUNC]], i64 1 |
| ; SSE4-NEXT: [[TMP27:%.*]] = insertelement <8 x i8> [[TMP26]], i8 [[A_SROA_15_8_EXTRACT_TRUNC]], i64 2 |
| ; SSE4-NEXT: [[TMP28:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[A_SROA_14_8_EXTRACT_TRUNC]], i64 3 |
| ; SSE4-NEXT: [[TMP29:%.*]] = insertelement <8 x i8> [[TMP28]], i8 [[A_SROA_13_8_EXTRACT_TRUNC]], i64 4 |
| ; SSE4-NEXT: [[TMP30:%.*]] = insertelement <8 x i8> [[TMP29]], i8 [[A_SROA_12_8_EXTRACT_TRUNC]], i64 5 |
| ; SSE4-NEXT: [[TMP31:%.*]] = insertelement <8 x i8> [[TMP30]], i8 [[A_SROA_11_8_EXTRACT_TRUNC]], i64 6 |
| ; SSE4-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[A_SROA_9_8_EXTRACT_TRUNC]], i64 7 |
| ; SSE4-NEXT: [[TMP33:%.*]] = lshr <8 x i8> [[TMP32]], splat (i8 1) |
| ; SSE4-NEXT: [[TMP34:%.*]] = insertelement <8 x i8> poison, i8 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 0 |
| ; SSE4-NEXT: [[TMP35:%.*]] = insertelement <8 x i8> [[TMP34]], i8 [[B_SROA_16_8_EXTRACT_TRUNC]], i64 1 |
| ; SSE4-NEXT: [[TMP36:%.*]] = insertelement <8 x i8> [[TMP35]], i8 [[B_SROA_15_8_EXTRACT_TRUNC]], i64 2 |
| ; SSE4-NEXT: [[TMP37:%.*]] = insertelement <8 x i8> [[TMP36]], i8 [[B_SROA_14_8_EXTRACT_TRUNC]], i64 3 |
| ; SSE4-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP37]], i8 [[B_SROA_13_8_EXTRACT_TRUNC]], i64 4 |
| ; SSE4-NEXT: [[TMP39:%.*]] = insertelement <8 x i8> [[TMP38]], i8 [[B_SROA_12_8_EXTRACT_TRUNC]], i64 5 |
| ; SSE4-NEXT: [[TMP40:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[B_SROA_11_8_EXTRACT_TRUNC]], i64 6 |
| ; SSE4-NEXT: [[TMP41:%.*]] = insertelement <8 x i8> [[TMP40]], i8 [[B_SROA_17_8_EXTRACT_TRUNC]], i64 7 |
| ; SSE4-NEXT: [[TMP42:%.*]] = lshr <8 x i8> [[TMP41]], splat (i8 1) |
| ; SSE4-NEXT: [[TMP43:%.*]] = add nuw <8 x i8> [[TMP42]], [[TMP33]] |
| ; SSE4-NEXT: [[TMP44:%.*]] = or <8 x i8> [[TMP41]], [[TMP32]] |
| ; SSE4-NEXT: [[TMP45:%.*]] = and <8 x i8> [[TMP44]], splat (i8 1) |
| ; SSE4-NEXT: [[TMP46:%.*]] = add nuw <8 x i8> [[TMP43]], [[TMP45]] |
| ; SSE4-NEXT: [[TMP47:%.*]] = zext <8 x i8> [[TMP46]] to <8 x i64> |
| ; SSE4-NEXT: [[TMP48:%.*]] = shl nuw <8 x i64> [[TMP47]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56> |
| ; SSE4-NEXT: [[TMP49:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP48]]) |
| ; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP49]], 1 |
| ; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] |
| ; |
| ; AVX-LABEL: @avgr_16_u8( |
| ; AVX-NEXT: entry: |
| ; AVX-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> poison, i64 [[A_COERCE0:%.*]], i64 0 |
| ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP0]], <8 x i64> poison, <8 x i32> zeroinitializer |
| ; AVX-NEXT: [[TMP2:%.*]] = lshr <8 x i64> [[TMP1]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56> |
| ; AVX-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i8> |
| ; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0 |
| ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> zeroinitializer |
| ; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i64> [[TMP5]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56> |
| ; AVX-NEXT: [[TMP7:%.*]] = trunc <8 x i64> [[TMP6]] to <8 x i8> |
| ; AVX-NEXT: [[TMP8:%.*]] = lshr <8 x i8> [[TMP3]], splat (i8 1) |
| ; AVX-NEXT: [[TMP9:%.*]] = lshr <8 x i8> [[TMP7]], splat (i8 1) |
| ; AVX-NEXT: [[TMP10:%.*]] = add nuw <8 x i8> [[TMP9]], [[TMP8]] |
| ; AVX-NEXT: [[TMP11:%.*]] = or <8 x i8> [[TMP7]], [[TMP3]] |
| ; AVX-NEXT: [[TMP12:%.*]] = and <8 x i8> [[TMP11]], splat (i8 1) |
| ; AVX-NEXT: [[TMP13:%.*]] = add nuw <8 x i8> [[TMP10]], [[TMP12]] |
| ; AVX-NEXT: [[TMP14:%.*]] = zext <8 x i8> [[TMP13]] to <8 x i64> |
| ; AVX-NEXT: [[TMP15:%.*]] = shl nuw <8 x i64> [[TMP14]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56> |
| ; AVX-NEXT: [[TMP16:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP15]]) |
| ; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP16]], 0 |
| ; AVX-NEXT: [[TMP17:%.*]] = insertelement <8 x i64> poison, i64 [[A_COERCE1:%.*]], i64 0 |
| ; AVX-NEXT: [[TMP18:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> poison, <8 x i32> zeroinitializer |
| ; AVX-NEXT: [[TMP19:%.*]] = lshr <8 x i64> [[TMP18]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56> |
| ; AVX-NEXT: [[TMP20:%.*]] = trunc <8 x i64> [[TMP19]] to <8 x i8> |
| ; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE1:%.*]], i64 0 |
| ; AVX-NEXT: [[TMP22:%.*]] = shufflevector <8 x i64> [[TMP21]], <8 x i64> poison, <8 x i32> zeroinitializer |
| ; AVX-NEXT: [[TMP23:%.*]] = lshr <8 x i64> [[TMP22]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56> |
| ; AVX-NEXT: [[TMP24:%.*]] = trunc <8 x i64> [[TMP23]] to <8 x i8> |
| ; AVX-NEXT: [[TMP25:%.*]] = lshr <8 x i8> [[TMP20]], splat (i8 1) |
| ; AVX-NEXT: [[TMP26:%.*]] = lshr <8 x i8> [[TMP24]], splat (i8 1) |
| ; AVX-NEXT: [[TMP27:%.*]] = add nuw <8 x i8> [[TMP26]], [[TMP25]] |
| ; AVX-NEXT: [[TMP28:%.*]] = or <8 x i8> [[TMP24]], [[TMP20]] |
| ; AVX-NEXT: [[TMP29:%.*]] = and <8 x i8> [[TMP28]], splat (i8 1) |
| ; AVX-NEXT: [[TMP30:%.*]] = add nuw <8 x i8> [[TMP27]], [[TMP29]] |
| ; AVX-NEXT: [[TMP31:%.*]] = zext <8 x i8> [[TMP30]] to <8 x i64> |
| ; AVX-NEXT: [[TMP32:%.*]] = shl nuw <8 x i64> [[TMP31]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56> |
| ; AVX-NEXT: [[TMP33:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP32]]) |
| ; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP33]], 1 |
| ; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] |
| ; |
| entry: |
| %retval = alloca %"struct.std::array16", align 1 |
| %a = alloca %"struct.std::array16", align 1 |
| %b = alloca %"struct.std::array16", align 1 |
| store i64 %a.coerce0, ptr %a, align 1 |
| %0 = getelementptr inbounds nuw i8, ptr %a, i64 8 |
| store i64 %a.coerce1, ptr %0, align 1 |
| store i64 %b.coerce0, ptr %b, align 1 |
| %1 = getelementptr inbounds nuw i8, ptr %b, i64 8 |
| store i64 %b.coerce1, ptr %1, align 1 |
| br label %for.cond |
| |
| for.cond: ; preds = %for.body, %entry |
| %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] |
| %cmp = icmp samesign ult i64 %i.0, 16 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.cond |
| %.fca.0.load = load i64, ptr %retval, align 1 |
| %.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0 |
| %.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8 |
| %.fca.1.load = load i64, ptr %.fca.1.gep, align 1 |
| %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1 |
| ret { i64, i64 } %.fca.1.insert |
| |
| for.body: ; preds = %for.cond |
| %arrayidx.i = getelementptr inbounds nuw i8, ptr %a, i64 %i.0 |
| %2 = load i8, ptr %arrayidx.i, align 1 |
| %arrayidx.i22 = getelementptr inbounds nuw i8, ptr %b, i64 %i.0 |
| %3 = load i8, ptr %arrayidx.i22, align 1 |
| %shr = lshr i8 %2, 1 |
| %shr5 = lshr i8 %3, 1 |
| %narrow = add nuw i8 %shr, %shr5 |
| %or21 = or i8 %2, %3 |
| %4 = and i8 %or21, 1 |
| %add12 = add i8 %narrow, %4 |
| %arrayidx.i23 = getelementptr inbounds nuw i8, ptr %retval, i64 %i.0 |
| store i8 %add12, ptr %arrayidx.i23, align 1 |
| %inc = add nuw nsw i64 %i.0, 1 |
| br label %for.cond |
| } |
| |
| define { i64, i64 } @avgr_8_u16(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) { |
| ; SSE2-LABEL: @avgr_8_u16( |
| ; SSE2-NEXT: entry: |
| ; SSE2-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32 |
| ; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32 |
| ; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48 |
| ; SSE2-NEXT: [[TMP2:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32 |
| ; SSE2-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32 |
| ; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0 |
| ; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP18]], i64 [[B_COERCE1:%.*]], i64 1 |
| ; SSE2-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32> |
| ; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32 |
| ; SSE2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32 |
| ; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0 |
| ; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[A_COERCE1]], i64 1 |
| ; SSE2-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535) |
| ; SSE2-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP3]], splat (i64 65535) |
| ; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 |
| ; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP2]], i64 1 |
| ; SSE2-NEXT: [[TMP11:%.*]] = lshr <2 x i32> [[TMP10]], splat (i32 16) |
| ; SSE2-NEXT: [[CONV2_4:%.*]] = lshr i64 [[B_COERCE0]], 48 |
| ; SSE2-NEXT: [[TMP20:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 16) |
| ; SSE2-NEXT: [[CONV_6:%.*]] = lshr i64 [[A_COERCE1]], 48 |
| ; SSE2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = and i64 [[A_SROA_3_0_EXTRACT_SHIFT]], 65535 |
| ; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48 |
| ; SSE2-NEXT: [[CONV2_2:%.*]] = and i64 [[B_SROA_3_0_EXTRACT_SHIFT]], 65535 |
| ; SSE2-NEXT: [[TMP31:%.*]] = add nuw nsw <2 x i64> [[TMP7]], splat (i64 1) |
| ; SSE2-NEXT: [[TMP14:%.*]] = add nuw nsw <2 x i64> [[TMP31]], [[TMP8]] |
| ; SSE2-NEXT: [[TMP15:%.*]] = lshr <2 x i64> [[TMP14]], splat (i64 1) |
| ; SSE2-NEXT: [[TMP16:%.*]] = add nuw nsw <2 x i32> [[TMP11]], splat (i32 1) |
| ; SSE2-NEXT: [[TMP17:%.*]] = add nuw nsw <2 x i32> [[TMP16]], [[TMP20]] |
| ; SSE2-NEXT: [[CONV_7:%.*]] = and i64 [[A_SROA_8_8_EXTRACT_SHIFT]], 65535 |
| ; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = and i64 [[B_SROA_8_8_EXTRACT_SHIFT]], 65535 |
| ; SSE2-NEXT: [[ADD_4:%.*]] = add nuw nsw i64 [[A_SROA_4_0_EXTRACT_SHIFT]], 1 |
| ; SSE2-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[CONV_7]], 1 |
| ; SSE2-NEXT: [[ADD3_3:%.*]] = add nuw nsw i64 [[ADD_3]], [[B_SROA_4_0_EXTRACT_SHIFT]] |
| ; SSE2-NEXT: [[ADD3_4:%.*]] = add nuw nsw i64 [[ADD_4]], [[CONV2_4]] |
| ; SSE2-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[CONV_6]], 1 |
| ; SSE2-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_9_8_EXTRACT_SHIFT]], 1 |
| ; SSE2-NEXT: [[ADD3_7:%.*]] = add nuw nsw i64 [[ADD_6]], [[B_SROA_9_8_EXTRACT_SHIFT]] |
| ; SSE2-NEXT: [[ADD3_2:%.*]] = add nuw nsw i64 [[ADD_7]], [[CONV2_2]] |
| ; SSE2-NEXT: [[TMP12:%.*]] = shl nuw i64 [[ADD3_7]], 47 |
| ; SSE2-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[ADD3_2]], 31 |
| ; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = and i64 [[TMP12]], -281474976710656 |
| ; SSE2-NEXT: [[SHR_4:%.*]] = and i64 [[TMP9]], 281470681743360 |
| ; SSE2-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[ADD3_3]], 31 |
| ; SSE2-NEXT: [[TMP21:%.*]] = shl nuw i64 [[ADD3_4]], 47 |
| ; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = and i64 [[TMP13]], 281470681743360 |
| ; SSE2-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = and i64 [[TMP21]], -281474976710656 |
| ; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_9_8_INSERT_EXT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]] |
| ; SSE2-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[SHR_4]] |
| ; SSE2-NEXT: [[TMP22:%.*]] = shl nuw <2 x i32> [[TMP17]], splat (i32 15) |
| ; SSE2-NEXT: [[TMP23:%.*]] = and <2 x i32> [[TMP22]], splat (i32 -65536) |
| ; SSE2-NEXT: [[TMP24:%.*]] = zext <2 x i32> [[TMP23]] to <2 x i64> |
| ; SSE2-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], i64 0 |
| ; SSE2-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> [[TMP25]], i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], i64 1 |
| ; SSE2-NEXT: [[TMP27:%.*]] = or disjoint <2 x i64> [[TMP26]], [[TMP24]] |
| ; SSE2-NEXT: [[TMP28:%.*]] = or disjoint <2 x i64> [[TMP27]], [[TMP15]] |
| ; SSE2-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i64 0 |
| ; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP29]], 0 |
| ; SSE2-NEXT: [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i64 1 |
| ; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP30]], 1 |
| ; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] |
| ; |
| ; SSE4-LABEL: @avgr_8_u16( |
| ; SSE4-NEXT: entry: |
| ; SSE4-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32 |
| ; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32 |
| ; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48 |
| ; SSE4-NEXT: [[TMP1:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32 |
| ; SSE4-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32 |
| ; SSE4-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0 |
| ; SSE4-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[B_COERCE1:%.*]], i64 1 |
| ; SSE4-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32> |
| ; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32 |
| ; SSE4-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32 |
| ; SSE4-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0 |
| ; SSE4-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[A_COERCE1]], i64 1 |
| ; SSE4-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535) |
| ; SSE4-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP3]], splat (i64 65535) |
| ; SSE4-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 |
| ; SSE4-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP1]], i64 1 |
| ; SSE4-NEXT: [[TMP11:%.*]] = lshr <2 x i32> [[TMP10]], splat (i32 16) |
| ; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48 |
| ; SSE4-NEXT: [[TMP12:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 16) |
| ; SSE4-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48 |
| ; SSE4-NEXT: [[CONV_2:%.*]] = and i64 [[A_SROA_3_0_EXTRACT_SHIFT]], 65535 |
| ; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48 |
| ; SSE4-NEXT: [[CONV2_2:%.*]] = and i64 [[B_SROA_3_0_EXTRACT_SHIFT]], 65535 |
| ; SSE4-NEXT: [[TMP13:%.*]] = add nuw nsw <2 x i64> [[TMP7]], splat (i64 1) |
| ; SSE4-NEXT: [[TMP14:%.*]] = add nuw nsw <2 x i64> [[TMP13]], [[TMP8]] |
| ; SSE4-NEXT: [[TMP15:%.*]] = lshr <2 x i64> [[TMP14]], splat (i64 1) |
| ; SSE4-NEXT: [[TMP16:%.*]] = add nuw nsw <2 x i32> [[TMP11]], splat (i32 1) |
| ; SSE4-NEXT: [[TMP17:%.*]] = add nuw nsw <2 x i32> [[TMP16]], [[TMP12]] |
| ; SSE4-NEXT: [[CONV_6:%.*]] = and i64 [[A_SROA_8_8_EXTRACT_SHIFT]], 65535 |
| ; SSE4-NEXT: [[CONV2_6:%.*]] = and i64 [[B_SROA_8_8_EXTRACT_SHIFT]], 65535 |
| ; SSE4-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[A_SROA_4_0_EXTRACT_SHIFT]], 1 |
| ; SSE4-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[CONV_6]], 1 |
| ; SSE4-NEXT: [[ADD3_6:%.*]] = add nuw nsw i64 [[ADD_6]], [[CONV2_6]] |
| ; SSE4-NEXT: [[ADD3_3:%.*]] = add nuw nsw i64 [[ADD_3]], [[B_SROA_4_0_EXTRACT_SHIFT]] |
| ; SSE4-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_9_8_EXTRACT_SHIFT]], 1 |
| ; SSE4-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[CONV_2]], 1 |
| ; SSE4-NEXT: [[ADD3_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_9_8_EXTRACT_SHIFT]] |
| ; SSE4-NEXT: [[ADD3_2:%.*]] = add nuw nsw i64 [[ADD_2]], [[CONV2_2]] |
| ; SSE4-NEXT: [[TMP18:%.*]] = shl nuw i64 [[ADD3_7]], 47 |
| ; SSE4-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[ADD3_2]], 31 |
| ; SSE4-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = and i64 [[TMP18]], -281474976710656 |
| ; SSE4-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = and i64 [[TMP19]], 281470681743360 |
| ; SSE4-NEXT: [[TMP20:%.*]] = shl nuw nsw i64 [[ADD3_6]], 31 |
| ; SSE4-NEXT: [[TMP21:%.*]] = shl nuw i64 [[ADD3_3]], 47 |
| ; SSE4-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = and i64 [[TMP20]], 281470681743360 |
| ; SSE4-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = and i64 [[TMP21]], -281474976710656 |
| ; SSE4-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_9_8_INSERT_EXT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]] |
| ; SSE4-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_4_0_INSERT_EXT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]] |
| ; SSE4-NEXT: [[TMP22:%.*]] = shl nuw <2 x i32> [[TMP17]], splat (i32 15) |
| ; SSE4-NEXT: [[TMP23:%.*]] = and <2 x i32> [[TMP22]], splat (i32 -65536) |
| ; SSE4-NEXT: [[TMP24:%.*]] = zext <2 x i32> [[TMP23]] to <2 x i64> |
| ; SSE4-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], i64 0 |
| ; SSE4-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> [[TMP25]], i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], i64 1 |
| ; SSE4-NEXT: [[TMP27:%.*]] = or disjoint <2 x i64> [[TMP26]], [[TMP24]] |
| ; SSE4-NEXT: [[TMP28:%.*]] = or disjoint <2 x i64> [[TMP27]], [[TMP15]] |
| ; SSE4-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i64 0 |
| ; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP29]], 0 |
| ; SSE4-NEXT: [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i64 1 |
| ; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP30]], 1 |
| ; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] |
| ; |
| ; AVX2-LABEL: @avgr_8_u16( |
| ; AVX2-NEXT: entry: |
| ; AVX2-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32 |
| ; AVX2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32 |
| ; AVX2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48 |
| ; AVX2-NEXT: [[TMP1:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32 |
| ; AVX2-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0 |
| ; AVX2-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[A_COERCE1]], i64 1 |
| ; AVX2-NEXT: [[TMP21:%.*]] = lshr <2 x i64> [[TMP20]], <i64 48, i64 32> |
| ; AVX2-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[B_COERCE1:%.*]], i64 1 |
| ; AVX2-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32> |
| ; AVX2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32 |
| ; AVX2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32 |
| ; AVX2-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[A_COERCE0]], i64 0 |
| ; AVX2-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535) |
| ; AVX2-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP3]], splat (i64 65535) |
| ; AVX2-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 |
| ; AVX2-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP1]], i64 1 |
| ; AVX2-NEXT: [[TMP11:%.*]] = lshr <2 x i32> [[TMP10]], splat (i32 16) |
| ; AVX2-NEXT: [[TMP12:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 16) |
| ; AVX2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48 |
| ; AVX2-NEXT: [[CONV_2:%.*]] = and i64 [[A_SROA_3_0_EXTRACT_SHIFT]], 65535 |
| ; AVX2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48 |
| ; AVX2-NEXT: [[CONV2_2:%.*]] = and i64 [[B_SROA_3_0_EXTRACT_SHIFT]], 65535 |
| ; AVX2-NEXT: [[TMP13:%.*]] = add nuw nsw <2 x i64> [[TMP7]], splat (i64 1) |
| ; AVX2-NEXT: [[TMP14:%.*]] = add nuw nsw <2 x i64> [[TMP13]], [[TMP8]] |
| ; AVX2-NEXT: [[TMP15:%.*]] = lshr <2 x i64> [[TMP14]], splat (i64 1) |
| ; AVX2-NEXT: [[TMP16:%.*]] = add nuw nsw <2 x i32> [[TMP11]], splat (i32 1) |
| ; AVX2-NEXT: [[TMP17:%.*]] = add nuw nsw <2 x i32> [[TMP16]], [[TMP12]] |
| ; AVX2-NEXT: [[TMP34:%.*]] = and <2 x i64> [[TMP21]], <i64 -1, i64 65535> |
| ; AVX2-NEXT: [[CONV2_6:%.*]] = and i64 [[B_SROA_8_8_EXTRACT_SHIFT]], 65535 |
| ; AVX2-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[A_SROA_4_0_EXTRACT_SHIFT]], 1 |
| ; AVX2-NEXT: [[TMP37:%.*]] = add nuw nsw <2 x i64> [[TMP34]], <i64 0, i64 1> |
| ; AVX2-NEXT: [[TMP35:%.*]] = insertelement <2 x i64> poison, i64 [[ADD_3]], i64 0 |
| ; AVX2-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP35]], i64 [[CONV2_6]], i64 1 |
| ; AVX2-NEXT: [[TMP38:%.*]] = add nuw nsw <2 x i64> [[TMP25]], [[TMP37]] |
| ; AVX2-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_9_8_EXTRACT_SHIFT]], 1 |
| ; AVX2-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[CONV_2]], 1 |
| ; AVX2-NEXT: [[ADD3_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_9_8_EXTRACT_SHIFT]] |
| ; AVX2-NEXT: [[ADD3_2:%.*]] = add nuw nsw i64 [[ADD_2]], [[CONV2_2]] |
| ; AVX2-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> poison, i64 [[ADD3_2]], i64 0 |
| ; AVX2-NEXT: [[TMP40:%.*]] = insertelement <2 x i64> [[TMP39]], i64 [[ADD3_7]], i64 1 |
| ; AVX2-NEXT: [[TMP41:%.*]] = shl nuw <2 x i64> [[TMP40]], <i64 31, i64 47> |
| ; AVX2-NEXT: [[TMP31:%.*]] = and <2 x i64> [[TMP41]], <i64 281470681743360, i64 -281474976710656> |
| ; AVX2-NEXT: [[TMP32:%.*]] = shl nuw <2 x i64> [[TMP38]], <i64 47, i64 31> |
| ; AVX2-NEXT: [[TMP33:%.*]] = and <2 x i64> [[TMP32]], <i64 -281474976710656, i64 281470681743360> |
| ; AVX2-NEXT: [[TMP26:%.*]] = or disjoint <2 x i64> [[TMP31]], [[TMP33]] |
| ; AVX2-NEXT: [[TMP22:%.*]] = shl nuw <2 x i32> [[TMP17]], splat (i32 15) |
| ; AVX2-NEXT: [[TMP23:%.*]] = and <2 x i32> [[TMP22]], splat (i32 -65536) |
| ; AVX2-NEXT: [[TMP24:%.*]] = zext <2 x i32> [[TMP23]] to <2 x i64> |
| ; AVX2-NEXT: [[TMP27:%.*]] = or disjoint <2 x i64> [[TMP26]], [[TMP24]] |
| ; AVX2-NEXT: [[TMP28:%.*]] = or disjoint <2 x i64> [[TMP27]], [[TMP15]] |
| ; AVX2-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i64 0 |
| ; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP29]], 0 |
| ; AVX2-NEXT: [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i64 1 |
| ; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP30]], 1 |
| ; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] |
| ; |
| ; AVX512-LABEL: @avgr_8_u16( |
| ; AVX512-NEXT: entry: |
| ; AVX512-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32 |
| ; AVX512-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0 |
| ; AVX512-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[B_COERCE1:%.*]], i64 1 |
| ; AVX512-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], <i64 48, i64 32> |
| ; AVX512-NEXT: [[TMP4:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32 |
| ; AVX512-NEXT: [[TMP31:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0 |
| ; AVX512-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> [[TMP31]], i64 [[A_COERCE1]], i64 1 |
| ; AVX512-NEXT: [[TMP49:%.*]] = lshr <2 x i64> [[TMP32]], <i64 48, i64 32> |
| ; AVX512-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[B_COERCE0]], i64 0 |
| ; AVX512-NEXT: [[TMP6:%.*]] = trunc <2 x i64> [[TMP5]] to <2 x i32> |
| ; AVX512-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[A_COERCE0]], i64 0 |
| ; AVX512-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], splat (i64 65535) |
| ; AVX512-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP5]], splat (i64 65535) |
| ; AVX512-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP7]], <i64 32, i64 0> |
| ; AVX512-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 |
| ; AVX512-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP4]], i64 1 |
| ; AVX512-NEXT: [[TMP13:%.*]] = lshr <2 x i32> [[TMP12]], splat (i32 16) |
| ; AVX512-NEXT: [[TMP14:%.*]] = lshr <2 x i64> [[TMP5]], <i64 32, i64 0> |
| ; AVX512-NEXT: [[TMP15:%.*]] = lshr <2 x i32> [[TMP6]], splat (i32 16) |
| ; AVX512-NEXT: [[TMP16:%.*]] = and <2 x i64> [[TMP10]], <i64 65535, i64 poison> |
| ; AVX512-NEXT: [[TMP17:%.*]] = lshr <2 x i64> [[TMP10]], <i64 65535, i64 48> |
| ; AVX512-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP17]], <2 x i32> <i32 0, i32 3> |
| ; AVX512-NEXT: [[TMP19:%.*]] = and <2 x i64> [[TMP14]], <i64 65535, i64 poison> |
| ; AVX512-NEXT: [[TMP20:%.*]] = lshr <2 x i64> [[TMP14]], <i64 65535, i64 48> |
| ; AVX512-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP19]], <2 x i64> [[TMP20]], <2 x i32> <i32 0, i32 3> |
| ; AVX512-NEXT: [[TMP22:%.*]] = add nuw nsw <2 x i64> [[TMP8]], splat (i64 1) |
| ; AVX512-NEXT: [[TMP23:%.*]] = add nuw nsw <2 x i64> [[TMP22]], [[TMP9]] |
| ; AVX512-NEXT: [[TMP24:%.*]] = lshr <2 x i64> [[TMP23]], splat (i64 1) |
| ; AVX512-NEXT: [[TMP25:%.*]] = add nuw nsw <2 x i32> [[TMP13]], splat (i32 1) |
| ; AVX512-NEXT: [[TMP26:%.*]] = add nuw nsw <2 x i32> [[TMP25]], [[TMP15]] |
| ; AVX512-NEXT: [[TMP30:%.*]] = and <2 x i64> [[TMP49]], <i64 -1, i64 65535> |
| ; AVX512-NEXT: [[TMP27:%.*]] = add nuw nsw <2 x i64> [[TMP3]], <i64 1, i64 poison> |
| ; AVX512-NEXT: [[TMP28:%.*]] = and <2 x i64> [[TMP3]], <i64 poison, i64 65535> |
| ; AVX512-NEXT: [[TMP29:%.*]] = shufflevector <2 x i64> [[TMP27]], <2 x i64> [[TMP28]], <2 x i32> <i32 0, i32 3> |
| ; AVX512-NEXT: [[TMP33:%.*]] = add nuw nsw <2 x i64> [[TMP30]], <i64 0, i64 1> |
| ; AVX512-NEXT: [[TMP34:%.*]] = add nuw nsw <2 x i64> [[TMP29]], [[TMP33]] |
| ; AVX512-NEXT: [[TMP35:%.*]] = add nuw nsw <2 x i64> [[TMP18]], splat (i64 1) |
| ; AVX512-NEXT: [[TMP36:%.*]] = add nuw nsw <2 x i64> [[TMP35]], [[TMP21]] |
| ; AVX512-NEXT: [[TMP37:%.*]] = shl nuw <2 x i64> [[TMP36]], <i64 31, i64 47> |
| ; AVX512-NEXT: [[TMP38:%.*]] = and <2 x i64> [[TMP37]], <i64 281470681743360, i64 -281474976710656> |
| ; AVX512-NEXT: [[TMP39:%.*]] = shl nuw <2 x i64> [[TMP34]], <i64 47, i64 31> |
| ; AVX512-NEXT: [[TMP40:%.*]] = and <2 x i64> [[TMP39]], <i64 -281474976710656, i64 281470681743360> |
| ; AVX512-NEXT: [[TMP41:%.*]] = or disjoint <2 x i64> [[TMP38]], [[TMP40]] |
| ; AVX512-NEXT: [[TMP42:%.*]] = shl nuw <2 x i32> [[TMP26]], splat (i32 15) |
| ; AVX512-NEXT: [[TMP43:%.*]] = and <2 x i32> [[TMP42]], splat (i32 -65536) |
| ; AVX512-NEXT: [[TMP44:%.*]] = zext <2 x i32> [[TMP43]] to <2 x i64> |
| ; AVX512-NEXT: [[TMP45:%.*]] = or disjoint <2 x i64> [[TMP41]], [[TMP44]] |
| ; AVX512-NEXT: [[TMP46:%.*]] = or disjoint <2 x i64> [[TMP45]], [[TMP24]] |
| ; AVX512-NEXT: [[TMP47:%.*]] = extractelement <2 x i64> [[TMP46]], i64 0 |
| ; AVX512-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP47]], 0 |
| ; AVX512-NEXT: [[TMP48:%.*]] = extractelement <2 x i64> [[TMP46]], i64 1 |
| ; AVX512-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP48]], 1 |
| ; AVX512-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] |
| ; |
| entry: |
| %retval = alloca %"struct.std::array8", align 2 |
| %a = alloca %"struct.std::array8", align 2 |
| %b = alloca %"struct.std::array8", align 2 |
| store i64 %a.coerce0, ptr %a, align 2 |
| %0 = getelementptr inbounds nuw i8, ptr %a, i64 8 |
| store i64 %a.coerce1, ptr %0, align 2 |
| store i64 %b.coerce0, ptr %b, align 2 |
| %1 = getelementptr inbounds nuw i8, ptr %b, i64 8 |
| store i64 %b.coerce1, ptr %1, align 2 |
| br label %for.cond |
| |
| for.cond: ; preds = %for.body, %entry |
| %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] |
| %cmp = icmp samesign ult i64 %i.0, 8 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.cond |
| %.fca.0.load = load i64, ptr %retval, align 2 |
| %.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0 |
| %.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8 |
| %.fca.1.load = load i64, ptr %.fca.1.gep, align 2 |
| %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1 |
| ret { i64, i64 } %.fca.1.insert |
| |
| for.body: ; preds = %for.cond |
| %arrayidx.i = getelementptr inbounds nuw [2 x i8], ptr %a, i64 %i.0 |
| %2 = load i16, ptr %arrayidx.i, align 2 |
| %conv = zext i16 %2 to i32 |
| %arrayidx.i10 = getelementptr inbounds nuw [2 x i8], ptr %b, i64 %i.0 |
| %3 = load i16, ptr %arrayidx.i10, align 2 |
| %conv2 = zext i16 %3 to i32 |
| %add = add nuw nsw i32 %conv, %conv2 |
| %add3 = add nuw nsw i32 %add, 1 |
| %shr = lshr i32 %add3, 1 |
| %conv4 = trunc nuw i32 %shr to i16 |
| %arrayidx.i11 = getelementptr inbounds nuw [2 x i8], ptr %retval, i64 %i.0 |
| store i16 %conv4, ptr %arrayidx.i11, align 2 |
| %inc = add nuw nsw i64 %i.0, 1 |
| br label %for.cond |
| } |
| |
| define { i64, i64 } @avgr_8_u16_alt(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) { |
| ; SSE2-LABEL: @avgr_8_u16_alt( |
| ; SSE2-NEXT: entry: |
| ; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 48 |
| ; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32 |
| ; SSE2-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16 |
| ; SSE2-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i16 |
| ; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i16 |
| ; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 48 |
| ; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32 |
| ; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16 |
| ; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i16 |
| ; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i16 |
| ; SSE2-NEXT: [[SHR5:%.*]] = lshr i16 [[B_SROA_0_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_1:%.*]] = lshr i16 [[B_SROA_2_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_3:%.*]] = lshr i16 [[B_SROA_4_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_2:%.*]] = lshr i16 [[B_SROA_3_0_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW:%.*]] = add nuw i16 [[SHR5_3]], [[SHR5]] |
| ; SSE2-NEXT: [[NARROW_1:%.*]] = add nuw i16 [[SHR5_2]], [[SHR5_1]] |
| ; SSE2-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0]] to i16 |
| ; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0 |
| ; SSE2-NEXT: [[TMP17:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i16 |
| ; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> [[TMP14]], i16 [[TMP17]], i64 1 |
| ; SSE2-NEXT: [[TMP4:%.*]] = trunc i64 [[B_COERCE0]] to i16 |
| ; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x i16> poison, i16 [[TMP4]], i64 0 |
| ; SSE2-NEXT: [[TMP6:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i16 |
| ; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[TMP6]], i64 1 |
| ; SSE2-NEXT: [[TMP19:%.*]] = lshr <2 x i16> [[TMP18]], splat (i16 1) |
| ; SSE2-NEXT: [[TMP21:%.*]] = lshr <2 x i16> [[TMP7]], splat (i16 1) |
| ; SSE2-NEXT: [[TMP10:%.*]] = add nuw <2 x i16> [[TMP21]], [[TMP19]] |
| ; SSE2-NEXT: [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE2-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> [[TMP37]], i16 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 2 |
| ; SSE2-NEXT: [[TMP39:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 3 |
| ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 2 |
| ; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 3 |
| ; SSE2-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP39]], [[TMP3]] |
| ; SSE2-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP8]], splat (i16 1) |
| ; SSE2-NEXT: [[TMP11:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x i16> [[TMP11]], i16 [[NARROW_1]], i64 2 |
| ; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x i16> [[TMP12]], i16 [[NARROW]], i64 3 |
| ; SSE2-NEXT: [[TMP15:%.*]] = add nuw <4 x i16> [[TMP13]], [[TMP9]] |
| ; SSE2-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP15]] to i64 |
| ; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP16]], 0 |
| ; SSE2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 48 |
| ; SSE2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32 |
| ; SSE2-NEXT: [[A_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16 |
| ; SSE2-NEXT: [[A_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_9_8_EXTRACT_SHIFT]] to i16 |
| ; SSE2-NEXT: [[A_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT]] to i16 |
| ; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE2:%.*]], 48 |
| ; SSE2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT1:%.*]] = lshr i64 [[B_COERCE2]], 32 |
| ; SSE2-NEXT: [[B_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE2]], 16 |
| ; SSE2-NEXT: [[B_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_9_8_EXTRACT_SHIFT]] to i16 |
| ; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT1]] to i16 |
| ; SSE2-NEXT: [[SHR_6:%.*]] = lshr i16 [[A_SROA_5_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR_7:%.*]] = lshr i16 [[A_SROA_7_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_6:%.*]] = lshr i16 [[B_SROA_8_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[SHR5_7:%.*]] = lshr i16 [[B_SROA_9_8_EXTRACT_TRUNC]], 1 |
| ; SSE2-NEXT: [[NARROW_6:%.*]] = add nuw i16 [[SHR5_6]], [[SHR_6]] |
| ; SSE2-NEXT: [[NARROW_7:%.*]] = add nuw i16 [[SHR5_7]], [[SHR_7]] |
| ; SSE2-NEXT: [[TMP40:%.*]] = trunc i64 [[B_COERCE1]] to i16 |
| ; SSE2-NEXT: [[TMP41:%.*]] = insertelement <2 x i16> poison, i16 [[TMP40]], i64 0 |
| ; SSE2-NEXT: [[TMP42:%.*]] = trunc i64 [[A_SROA_7_8_EXTRACT_SHIFT]] to i16 |
| ; SSE2-NEXT: [[TMP27:%.*]] = insertelement <2 x i16> [[TMP41]], i16 [[TMP42]], i64 1 |
| ; SSE2-NEXT: [[TMP28:%.*]] = trunc i64 [[B_COERCE2]] to i16 |
| ; SSE2-NEXT: [[TMP29:%.*]] = insertelement <2 x i16> poison, i16 [[TMP28]], i64 0 |
| ; SSE2-NEXT: [[TMP45:%.*]] = trunc i64 [[B_SROA_7_8_EXTRACT_SHIFT]] to i16 |
| ; SSE2-NEXT: [[TMP46:%.*]] = insertelement <2 x i16> [[TMP29]], i16 [[TMP45]], i64 1 |
| ; SSE2-NEXT: [[TMP32:%.*]] = lshr <2 x i16> [[TMP27]], splat (i16 1) |
| ; SSE2-NEXT: [[TMP47:%.*]] = lshr <2 x i16> [[TMP46]], splat (i16 1) |
| ; SSE2-NEXT: [[TMP34:%.*]] = add nuw <2 x i16> [[TMP47]], [[TMP32]] |
| ; SSE2-NEXT: [[TMP35:%.*]] = shufflevector <2 x i16> [[TMP46]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE2-NEXT: [[TMP36:%.*]] = insertelement <4 x i16> [[TMP35]], i16 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 2 |
| ; SSE2-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> [[TMP36]], i16 [[B_SROA_8_8_EXTRACT_TRUNC]], i64 3 |
| ; SSE2-NEXT: [[TMP22:%.*]] = shufflevector <2 x i16> [[TMP27]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[A_SROA_7_8_EXTRACT_TRUNC]], i64 2 |
| ; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[A_SROA_5_8_EXTRACT_TRUNC]], i64 3 |
| ; SSE2-NEXT: [[TMP25:%.*]] = or <4 x i16> [[TMP20]], [[TMP24]] |
| ; SSE2-NEXT: [[TMP26:%.*]] = and <4 x i16> [[TMP25]], splat (i16 1) |
| ; SSE2-NEXT: [[TMP43:%.*]] = shufflevector <2 x i16> [[TMP34]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE2-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP43]], i16 [[NARROW_7]], i64 2 |
| ; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[NARROW_6]], i64 3 |
| ; SSE2-NEXT: [[TMP31:%.*]] = add nuw <4 x i16> [[TMP30]], [[TMP26]] |
| ; SSE2-NEXT: [[TMP33:%.*]] = bitcast <4 x i16> [[TMP31]] to i64 |
| ; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP33]], 1 |
| ; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] |
| ; |
| ; SSE4-LABEL: @avgr_8_u16_alt( |
| ; SSE4-NEXT: entry: |
| ; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 48 |
| ; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32 |
| ; SSE4-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16 |
| ; SSE4-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i16 |
| ; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i16 |
| ; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 48 |
| ; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32 |
| ; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16 |
| ; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i16 |
| ; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i16 |
| ; SSE4-NEXT: [[SHR5:%.*]] = lshr i16 [[B_SROA_0_0_EXTRACT_TRUNC]], 1 |
| ; SSE4-NEXT: [[SHR5_1:%.*]] = lshr i16 [[B_SROA_2_0_EXTRACT_TRUNC]], 1 |
| ; SSE4-NEXT: [[SHR5_3:%.*]] = lshr i16 [[B_SROA_4_0_EXTRACT_TRUNC]], 1 |
| ; SSE4-NEXT: [[SHR5_2:%.*]] = lshr i16 [[B_SROA_3_0_EXTRACT_TRUNC]], 1 |
| ; SSE4-NEXT: [[NARROW:%.*]] = add nuw i16 [[SHR5_3]], [[SHR5]] |
| ; SSE4-NEXT: [[NARROW_1:%.*]] = add nuw i16 [[SHR5_2]], [[SHR5_1]] |
| ; SSE4-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0]] to i16 |
| ; SSE4-NEXT: [[TMP14:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0 |
| ; SSE4-NEXT: [[TMP17:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i16 |
| ; SSE4-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> [[TMP14]], i16 [[TMP17]], i64 1 |
| ; SSE4-NEXT: [[TMP4:%.*]] = trunc i64 [[B_COERCE0]] to i16 |
| ; SSE4-NEXT: [[TMP5:%.*]] = insertelement <2 x i16> poison, i16 [[TMP4]], i64 0 |
| ; SSE4-NEXT: [[TMP6:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i16 |
| ; SSE4-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[TMP6]], i64 1 |
| ; SSE4-NEXT: [[TMP19:%.*]] = lshr <2 x i16> [[TMP18]], splat (i16 1) |
| ; SSE4-NEXT: [[TMP21:%.*]] = lshr <2 x i16> [[TMP7]], splat (i16 1) |
| ; SSE4-NEXT: [[TMP10:%.*]] = add nuw <2 x i16> [[TMP21]], [[TMP19]] |
| ; SSE4-NEXT: [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE4-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> [[TMP37]], i16 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 2 |
| ; SSE4-NEXT: [[TMP39:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 3 |
| ; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE4-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 2 |
| ; SSE4-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 3 |
| ; SSE4-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP39]], [[TMP3]] |
| ; SSE4-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP8]], splat (i16 1) |
| ; SSE4-NEXT: [[TMP11:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE4-NEXT: [[TMP12:%.*]] = insertelement <4 x i16> [[TMP11]], i16 [[NARROW_1]], i64 2 |
| ; SSE4-NEXT: [[TMP13:%.*]] = insertelement <4 x i16> [[TMP12]], i16 [[NARROW]], i64 3 |
| ; SSE4-NEXT: [[TMP15:%.*]] = add nuw <4 x i16> [[TMP13]], [[TMP9]] |
| ; SSE4-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP15]] to i64 |
| ; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP16]], 0 |
| ; SSE4-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 48 |
| ; SSE4-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32 |
| ; SSE4-NEXT: [[A_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16 |
| ; SSE4-NEXT: [[A_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_9_8_EXTRACT_SHIFT]] to i16 |
| ; SSE4-NEXT: [[A_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT]] to i16 |
| ; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE2:%.*]], 48 |
| ; SSE4-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT1:%.*]] = lshr i64 [[B_COERCE2]], 32 |
| ; SSE4-NEXT: [[B_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE2]], 16 |
| ; SSE4-NEXT: [[B_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_9_8_EXTRACT_SHIFT]] to i16 |
| ; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT1]] to i16 |
| ; SSE4-NEXT: [[SHR_6:%.*]] = lshr i16 [[A_SROA_5_8_EXTRACT_TRUNC]], 1 |
| ; SSE4-NEXT: [[SHR_7:%.*]] = lshr i16 [[A_SROA_7_8_EXTRACT_TRUNC]], 1 |
| ; SSE4-NEXT: [[SHR5_6:%.*]] = lshr i16 [[B_SROA_8_8_EXTRACT_TRUNC]], 1 |
| ; SSE4-NEXT: [[SHR5_7:%.*]] = lshr i16 [[B_SROA_9_8_EXTRACT_TRUNC]], 1 |
| ; SSE4-NEXT: [[NARROW_6:%.*]] = add nuw i16 [[SHR5_6]], [[SHR_6]] |
| ; SSE4-NEXT: [[NARROW_7:%.*]] = add nuw i16 [[SHR5_7]], [[SHR_7]] |
| ; SSE4-NEXT: [[TMP40:%.*]] = trunc i64 [[B_COERCE1]] to i16 |
| ; SSE4-NEXT: [[TMP41:%.*]] = insertelement <2 x i16> poison, i16 [[TMP40]], i64 0 |
| ; SSE4-NEXT: [[TMP42:%.*]] = trunc i64 [[A_SROA_7_8_EXTRACT_SHIFT]] to i16 |
| ; SSE4-NEXT: [[TMP27:%.*]] = insertelement <2 x i16> [[TMP41]], i16 [[TMP42]], i64 1 |
| ; SSE4-NEXT: [[TMP28:%.*]] = trunc i64 [[B_COERCE2]] to i16 |
| ; SSE4-NEXT: [[TMP29:%.*]] = insertelement <2 x i16> poison, i16 [[TMP28]], i64 0 |
| ; SSE4-NEXT: [[TMP45:%.*]] = trunc i64 [[B_SROA_7_8_EXTRACT_SHIFT]] to i16 |
| ; SSE4-NEXT: [[TMP46:%.*]] = insertelement <2 x i16> [[TMP29]], i16 [[TMP45]], i64 1 |
| ; SSE4-NEXT: [[TMP32:%.*]] = lshr <2 x i16> [[TMP27]], splat (i16 1) |
| ; SSE4-NEXT: [[TMP47:%.*]] = lshr <2 x i16> [[TMP46]], splat (i16 1) |
| ; SSE4-NEXT: [[TMP34:%.*]] = add nuw <2 x i16> [[TMP47]], [[TMP32]] |
| ; SSE4-NEXT: [[TMP35:%.*]] = shufflevector <2 x i16> [[TMP46]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE4-NEXT: [[TMP36:%.*]] = insertelement <4 x i16> [[TMP35]], i16 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 2 |
| ; SSE4-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> [[TMP36]], i16 [[B_SROA_8_8_EXTRACT_TRUNC]], i64 3 |
| ; SSE4-NEXT: [[TMP22:%.*]] = shufflevector <2 x i16> [[TMP27]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE4-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[A_SROA_7_8_EXTRACT_TRUNC]], i64 2 |
| ; SSE4-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[A_SROA_5_8_EXTRACT_TRUNC]], i64 3 |
| ; SSE4-NEXT: [[TMP25:%.*]] = or <4 x i16> [[TMP20]], [[TMP24]] |
| ; SSE4-NEXT: [[TMP26:%.*]] = and <4 x i16> [[TMP25]], splat (i16 1) |
| ; SSE4-NEXT: [[TMP43:%.*]] = shufflevector <2 x i16> [[TMP34]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> |
| ; SSE4-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP43]], i16 [[NARROW_7]], i64 2 |
| ; SSE4-NEXT: [[TMP30:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[NARROW_6]], i64 3 |
| ; SSE4-NEXT: [[TMP31:%.*]] = add nuw <4 x i16> [[TMP30]], [[TMP26]] |
| ; SSE4-NEXT: [[TMP33:%.*]] = bitcast <4 x i16> [[TMP31]] to i64 |
| ; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP33]], 1 |
| ; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] |
| ; |
| ; AVX2-LABEL: @avgr_8_u16_alt( |
| ; AVX2-NEXT: entry: |
| ; AVX2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 48 |
| ; AVX2-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32 |
| ; AVX2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16 |
| ; AVX2-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[A_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE0]] to i16 |
| ; AVX2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 48 |
| ; AVX2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32 |
| ; AVX2-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16 |
| ; AVX2-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE0]] to i16 |
| ; AVX2-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A_SROA_4_0_EXTRACT_TRUNC]], i64 0 |
| ; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[A_SROA_3_0_EXTRACT_TRUNC]], i64 1 |
| ; AVX2-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[A_SROA_2_0_EXTRACT_TRUNC]], i64 2 |
| ; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[A_SROA_0_0_EXTRACT_TRUNC]], i64 3 |
| ; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1) |
| ; AVX2-NEXT: [[TMP5:%.*]] = insertelement <4 x i16> poison, i16 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 0 |
| ; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i16> [[TMP5]], i16 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 1 |
| ; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 2 |
| ; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i16> [[TMP7]], i16 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 3 |
| ; AVX2-NEXT: [[TMP9:%.*]] = lshr <4 x i16> [[TMP8]], splat (i16 1) |
| ; AVX2-NEXT: [[TMP10:%.*]] = add nuw <4 x i16> [[TMP9]], [[TMP4]] |
| ; AVX2-NEXT: [[TMP11:%.*]] = or <4 x i16> [[TMP8]], [[TMP3]] |
| ; AVX2-NEXT: [[TMP12:%.*]] = and <4 x i16> [[TMP11]], splat (i16 1) |
| ; AVX2-NEXT: [[TMP13:%.*]] = add nuw <4 x i16> [[TMP10]], [[TMP12]] |
| ; AVX2-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 |
| ; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP15]], 0 |
| ; AVX2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1:%.*]], 48 |
| ; AVX2-NEXT: [[A_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32 |
| ; AVX2-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 16 |
| ; AVX2-NEXT: [[A_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_9_8_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[A_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_7_8_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[A_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_8_8_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[A_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE1]] to i16 |
| ; AVX2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 48 |
| ; AVX2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32 |
| ; AVX2-NEXT: [[B_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16 |
| ; AVX2-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_9_8_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[B_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[B_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_7_8_EXTRACT_SHIFT]] to i16 |
| ; AVX2-NEXT: [[B_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE1]] to i16 |
| ; AVX2-NEXT: [[TMP16:%.*]] = insertelement <4 x i16> poison, i16 [[A_SROA_9_8_EXTRACT_TRUNC]], i64 0 |
| ; AVX2-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[A_SROA_8_8_EXTRACT_TRUNC]], i64 1 |
| ; AVX2-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[A_SROA_7_8_EXTRACT_TRUNC]], i64 2 |
| ; AVX2-NEXT: [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[A_SROA_5_8_EXTRACT_TRUNC]], i64 3 |
| ; AVX2-NEXT: [[TMP20:%.*]] = lshr <4 x i16> [[TMP19]], splat (i16 1) |
| ; AVX2-NEXT: [[TMP21:%.*]] = insertelement <4 x i16> poison, i16 [[B_SROA_5_8_EXTRACT_TRUNC]], i64 0 |
| ; AVX2-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> [[TMP21]], i16 [[B_SROA_8_8_EXTRACT_TRUNC]], i64 1 |
| ; AVX2-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[B_SROA_7_8_EXTRACT_TRUNC]], i64 2 |
| ; AVX2-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 3 |
| ; AVX2-NEXT: [[TMP25:%.*]] = lshr <4 x i16> [[TMP24]], splat (i16 1) |
| ; AVX2-NEXT: [[TMP26:%.*]] = add nuw <4 x i16> [[TMP25]], [[TMP20]] |
| ; AVX2-NEXT: [[TMP27:%.*]] = or <4 x i16> [[TMP24]], [[TMP19]] |
| ; AVX2-NEXT: [[TMP28:%.*]] = and <4 x i16> [[TMP27]], splat (i16 1) |
| ; AVX2-NEXT: [[TMP29:%.*]] = add nuw <4 x i16> [[TMP26]], [[TMP28]] |
| ; AVX2-NEXT: [[TMP31:%.*]] = bitcast <4 x i16> [[TMP29]] to i64 |
| ; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP31]], 1 |
| ; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] |
| ; |
| ; AVX512-LABEL: @avgr_8_u16_alt( |
| ; AVX512-NEXT: entry: |
| ; AVX512-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[A_COERCE0:%.*]], i64 0 |
| ; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer |
| ; AVX512-NEXT: [[TMP2:%.*]] = lshr <4 x i64> [[TMP1]], <i64 0, i64 16, i64 32, i64 48> |
| ; AVX512-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i16> |
| ; AVX512-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0 |
| ; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> zeroinitializer |
| ; AVX512-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP5]], <i64 0, i64 16, i64 32, i64 48> |
| ; AVX512-NEXT: [[TMP7:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i16> |
| ; AVX512-NEXT: [[TMP8:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1) |
| ; AVX512-NEXT: [[TMP9:%.*]] = lshr <4 x i16> [[TMP7]], splat (i16 1) |
| ; AVX512-NEXT: [[TMP10:%.*]] = add nuw <4 x i16> [[TMP9]], [[TMP8]] |
| ; AVX512-NEXT: [[TMP11:%.*]] = or <4 x i16> [[TMP7]], [[TMP3]] |
| ; AVX512-NEXT: [[TMP12:%.*]] = and <4 x i16> [[TMP11]], splat (i16 1) |
| ; AVX512-NEXT: [[TMP13:%.*]] = add nuw <4 x i16> [[TMP10]], [[TMP12]] |
| ; AVX512-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 |
| ; AVX512-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP15]], 0 |
| ; AVX512-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[A_COERCE1:%.*]], i64 0 |
| ; AVX512-NEXT: [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP16]], <4 x i64> poison, <4 x i32> zeroinitializer |
| ; AVX512-NEXT: [[TMP18:%.*]] = lshr <4 x i64> [[TMP17]], <i64 0, i64 16, i64 32, i64 48> |
| ; AVX512-NEXT: [[TMP19:%.*]] = trunc <4 x i64> [[TMP18]] to <4 x i16> |
| ; AVX512-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[B_COERCE1:%.*]], i64 0 |
| ; AVX512-NEXT: [[TMP21:%.*]] = shufflevector <4 x i64> [[TMP20]], <4 x i64> poison, <4 x i32> zeroinitializer |
| ; AVX512-NEXT: [[TMP22:%.*]] = lshr <4 x i64> [[TMP21]], <i64 0, i64 16, i64 32, i64 48> |
| ; AVX512-NEXT: [[TMP23:%.*]] = trunc <4 x i64> [[TMP22]] to <4 x i16> |
| ; AVX512-NEXT: [[TMP24:%.*]] = lshr <4 x i16> [[TMP19]], splat (i16 1) |
| ; AVX512-NEXT: [[TMP25:%.*]] = lshr <4 x i16> [[TMP23]], splat (i16 1) |
| ; AVX512-NEXT: [[TMP26:%.*]] = add nuw <4 x i16> [[TMP25]], [[TMP24]] |
| ; AVX512-NEXT: [[TMP27:%.*]] = or <4 x i16> [[TMP23]], [[TMP19]] |
| ; AVX512-NEXT: [[TMP28:%.*]] = and <4 x i16> [[TMP27]], splat (i16 1) |
| ; AVX512-NEXT: [[TMP29:%.*]] = add nuw <4 x i16> [[TMP26]], [[TMP28]] |
| ; AVX512-NEXT: [[TMP31:%.*]] = bitcast <4 x i16> [[TMP29]] to i64 |
| ; AVX512-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP31]], 1 |
| ; AVX512-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] |
| ; |
| entry: |
| %retval = alloca %"struct.std::array8", align 2 |
| %a = alloca %"struct.std::array8", align 2 |
| %b = alloca %"struct.std::array8", align 2 |
| store i64 %a.coerce0, ptr %a, align 2 |
| %0 = getelementptr inbounds nuw i8, ptr %a, i64 8 |
| store i64 %a.coerce1, ptr %0, align 2 |
| store i64 %b.coerce0, ptr %b, align 2 |
| %1 = getelementptr inbounds nuw i8, ptr %b, i64 8 |
| store i64 %b.coerce1, ptr %1, align 2 |
| br label %for.cond |
| |
| for.cond: ; preds = %for.body, %entry |
| %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] |
| %cmp = icmp samesign ult i64 %i.0, 8 |
| br i1 %cmp, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.cond |
| %.fca.0.load = load i64, ptr %retval, align 2 |
| %.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0 |
| %.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8 |
| %.fca.1.load = load i64, ptr %.fca.1.gep, align 2 |
| %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1 |
| ret { i64, i64 } %.fca.1.insert |
| |
| for.body: ; preds = %for.cond |
| %arrayidx.i = getelementptr inbounds nuw [2 x i8], ptr %a, i64 %i.0 |
| %2 = load i16, ptr %arrayidx.i, align 2 |
| %arrayidx.i22 = getelementptr inbounds nuw [2 x i8], ptr %b, i64 %i.0 |
| %3 = load i16, ptr %arrayidx.i22, align 2 |
| %shr = lshr i16 %2, 1 |
| %shr5 = lshr i16 %3, 1 |
| %narrow = add nuw i16 %shr, %shr5 |
| %or21 = or i16 %2, %3 |
| %4 = and i16 %or21, 1 |
| %add12 = add i16 %narrow, %4 |
| %arrayidx.i23 = getelementptr inbounds nuw [2 x i8], ptr %retval, i64 %i.0 |
| store i16 %add12, ptr %arrayidx.i23, align 2 |
| %inc = add nuw nsw i64 %i.0, 1 |
| br label %for.cond |
| } |
| ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: |
| ; CHECK: {{.*}} |