; blob: bad894d142fd2bfa116d4510dca1b1f289f1c163
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
; PR128424
%"struct.std::array8" = type { [8 x i16] }
%"struct.std::array16" = type { [16 x i8] }
define { i64, i64 } @avgr_16_u8(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
; SSE2-LABEL: @avgr_16_u8(
; SSE2-NEXT: entry:
; SSE2-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i16
; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[A_COERCE1:%.*]], i64 1
; SSE2-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 16)
; SSE2-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 24)
; SSE2-NEXT: [[TMP5:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 32)
; SSE2-NEXT: [[TMP6:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 40)
; SSE2-NEXT: [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
; SSE2-NEXT: [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 56
; SSE2-NEXT: [[TMP7:%.*]] = trunc i64 [[A_COERCE1]] to i16
; SSE2-NEXT: [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
; SSE2-NEXT: [[TMP8:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i16
; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0]], i64 0
; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[B_COERCE1:%.*]], i64 1
; SSE2-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 16)
; SSE2-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 24)
; SSE2-NEXT: [[TMP13:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 32)
; SSE2-NEXT: [[TMP14:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 40)
; SSE2-NEXT: [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
; SSE2-NEXT: [[TMP15:%.*]] = trunc i64 [[B_COERCE1]] to i16
; SSE2-NEXT: [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
; SSE2-NEXT: [[TMP16:%.*]] = and <2 x i64> [[TMP2]], splat (i64 255)
; SSE2-NEXT: [[TMP17:%.*]] = and <2 x i64> [[TMP10]], splat (i64 255)
; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> [[TMP18]], i16 [[TMP7]], i64 1
; SSE2-NEXT: [[TMP20:%.*]] = lshr <2 x i16> [[TMP19]], splat (i16 8)
; SSE2-NEXT: [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 56
; SSE2-NEXT: [[TMP21:%.*]] = insertelement <2 x i16> poison, i16 [[TMP8]], i64 0
; SSE2-NEXT: [[TMP22:%.*]] = insertelement <2 x i16> [[TMP21]], i16 [[TMP15]], i64 1
; SSE2-NEXT: [[TMP23:%.*]] = lshr <2 x i16> [[TMP22]], splat (i16 8)
; SSE2-NEXT: [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 56
; SSE2-NEXT: [[CONV1_6:%.*]] = and i64 [[A_SROA_7_0_EXTRACT_SHIFT]], 255
; SSE2-NEXT: [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 56
; SSE2-NEXT: [[CONV4_6:%.*]] = and i64 [[B_SROA_7_0_EXTRACT_SHIFT]], 255
; SSE2-NEXT: [[TMP24:%.*]] = add nuw nsw <2 x i64> [[TMP16]], splat (i64 1)
; SSE2-NEXT: [[TMP25:%.*]] = add nuw nsw <2 x i64> [[TMP24]], [[TMP17]]
; SSE2-NEXT: [[TMP26:%.*]] = lshr <2 x i64> [[TMP25]], splat (i64 1)
; SSE2-NEXT: [[TMP27:%.*]] = add nuw nsw <2 x i16> [[TMP20]], splat (i16 1)
; SSE2-NEXT: [[TMP28:%.*]] = add nuw nsw <2 x i16> [[TMP27]], [[TMP23]]
; SSE2-NEXT: [[TMP29:%.*]] = and <2 x i64> [[TMP3]], splat (i64 255)
; SSE2-NEXT: [[TMP30:%.*]] = and <2 x i64> [[TMP11]], splat (i64 255)
; SSE2-NEXT: [[TMP31:%.*]] = add nuw nsw <2 x i64> [[TMP29]], splat (i64 1)
; SSE2-NEXT: [[TMP32:%.*]] = add nuw nsw <2 x i64> [[TMP31]], [[TMP30]]
; SSE2-NEXT: [[TMP33:%.*]] = and <2 x i64> [[TMP4]], splat (i64 255)
; SSE2-NEXT: [[TMP34:%.*]] = and <2 x i64> [[TMP12]], splat (i64 255)
; SSE2-NEXT: [[TMP35:%.*]] = add nuw nsw <2 x i64> [[TMP33]], splat (i64 1)
; SSE2-NEXT: [[TMP36:%.*]] = add nuw nsw <2 x i64> [[TMP35]], [[TMP34]]
; SSE2-NEXT: [[TMP37:%.*]] = and <2 x i64> [[TMP5]], splat (i64 255)
; SSE2-NEXT: [[TMP38:%.*]] = and <2 x i64> [[TMP13]], splat (i64 255)
; SSE2-NEXT: [[TMP39:%.*]] = add nuw nsw <2 x i64> [[TMP37]], splat (i64 1)
; SSE2-NEXT: [[TMP40:%.*]] = add nuw nsw <2 x i64> [[TMP39]], [[TMP38]]
; SSE2-NEXT: [[TMP41:%.*]] = and <2 x i64> [[TMP6]], splat (i64 255)
; SSE2-NEXT: [[TMP42:%.*]] = and <2 x i64> [[TMP14]], splat (i64 255)
; SSE2-NEXT: [[TMP43:%.*]] = add nuw nsw <2 x i64> [[TMP41]], splat (i64 1)
; SSE2-NEXT: [[TMP44:%.*]] = add nuw nsw <2 x i64> [[TMP43]], [[TMP42]]
; SSE2-NEXT: [[CONV1_14:%.*]] = and i64 [[A_SROA_16_8_EXTRACT_SHIFT]], 255
; SSE2-NEXT: [[CONV4_14:%.*]] = and i64 [[B_SROA_16_8_EXTRACT_SHIFT]], 255
; SSE2-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_8_0_EXTRACT_SHIFT]], 1
; SSE2-NEXT: [[ADD_14:%.*]] = add nuw nsw i64 [[CONV1_14]], 1
; SSE2-NEXT: [[ADD5_14:%.*]] = add nuw nsw i64 [[ADD_14]], [[CONV4_14]]
; SSE2-NEXT: [[ADD5_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_8_0_EXTRACT_SHIFT]]
; SSE2-NEXT: [[ADD_15:%.*]] = add nuw nsw i64 [[A_SROA_17_8_EXTRACT_SHIFT]], 1
; SSE2-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[CONV1_6]], 1
; SSE2-NEXT: [[ADD5_15:%.*]] = add nuw nsw i64 [[ADD_15]], [[B_SROA_17_8_EXTRACT_SHIFT]]
; SSE2-NEXT: [[ADD5_6:%.*]] = add nuw nsw i64 [[ADD_6]], [[CONV4_6]]
; SSE2-NEXT: [[TMP45:%.*]] = shl nuw i64 [[ADD5_15]], 55
; SSE2-NEXT: [[TMP46:%.*]] = shl nuw nsw i64 [[ADD5_6]], 47
; SSE2-NEXT: [[RETVAL_SROA_17_8_INSERT_EXT:%.*]] = and i64 [[TMP45]], -72057594037927936
; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_SHIFT:%.*]] = and i64 [[TMP46]], 71776119061217280
; SSE2-NEXT: [[TMP47:%.*]] = shl nuw nsw i64 [[ADD5_14]], 47
; SSE2-NEXT: [[TMP48:%.*]] = shl nuw i64 [[ADD5_7]], 55
; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_SHIFT:%.*]] = and i64 [[TMP47]], 71776119061217280
; SSE2-NEXT: [[RETVAL_SROA_8_0_INSERT_EXT:%.*]] = and i64 [[TMP48]], -72057594037927936
; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_17_8_INSERT_EXT]], [[RETVAL_SROA_16_8_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_0_INSERT_EXT]], [[RETVAL_SROA_7_0_INSERT_SHIFT]]
; SSE2-NEXT: [[TMP49:%.*]] = shl nuw nsw <2 x i64> [[TMP44]], splat (i64 39)
; SSE2-NEXT: [[TMP50:%.*]] = and <2 x i64> [[TMP49]], splat (i64 280375465082880)
; SSE2-NEXT: [[TMP51:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_7_0_INSERT_INSERT]], i64 0
; SSE2-NEXT: [[TMP52:%.*]] = insertelement <2 x i64> [[TMP51]], i64 [[RETVAL_SROA_16_8_INSERT_INSERT]], i64 1
; SSE2-NEXT: [[TMP53:%.*]] = or disjoint <2 x i64> [[TMP52]], [[TMP50]]
; SSE2-NEXT: [[TMP54:%.*]] = shl nuw nsw <2 x i64> [[TMP40]], splat (i64 31)
; SSE2-NEXT: [[TMP55:%.*]] = and <2 x i64> [[TMP54]], splat (i64 1095216660480)
; SSE2-NEXT: [[TMP56:%.*]] = or disjoint <2 x i64> [[TMP53]], [[TMP55]]
; SSE2-NEXT: [[TMP57:%.*]] = shl nuw nsw <2 x i64> [[TMP36]], splat (i64 23)
; SSE2-NEXT: [[TMP58:%.*]] = and <2 x i64> [[TMP57]], splat (i64 4278190080)
; SSE2-NEXT: [[TMP59:%.*]] = or disjoint <2 x i64> [[TMP56]], [[TMP58]]
; SSE2-NEXT: [[TMP60:%.*]] = shl nuw nsw <2 x i64> [[TMP32]], splat (i64 15)
; SSE2-NEXT: [[TMP61:%.*]] = and <2 x i64> [[TMP60]], splat (i64 16711680)
; SSE2-NEXT: [[TMP62:%.*]] = shl nuw <2 x i16> [[TMP28]], splat (i16 7)
; SSE2-NEXT: [[TMP63:%.*]] = or disjoint <2 x i64> [[TMP59]], [[TMP61]]
; SSE2-NEXT: [[TMP64:%.*]] = and <2 x i16> [[TMP62]], splat (i16 -256)
; SSE2-NEXT: [[TMP65:%.*]] = zext <2 x i16> [[TMP64]] to <2 x i64>
; SSE2-NEXT: [[TMP66:%.*]] = or <2 x i64> [[TMP63]], [[TMP65]]
; SSE2-NEXT: [[TMP67:%.*]] = or <2 x i64> [[TMP66]], [[TMP26]]
; SSE2-NEXT: [[TMP68:%.*]] = extractelement <2 x i64> [[TMP67]], i64 0
; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP68]], 0
; SSE2-NEXT: [[TMP69:%.*]] = extractelement <2 x i64> [[TMP67]], i64 1
; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP69]], 1
; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; SSE4-LABEL: @avgr_16_u8(
; SSE4-NEXT: entry:
; SSE4-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i16
; SSE4-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
; SSE4-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[A_COERCE1:%.*]], i64 1
; SSE4-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 16)
; SSE4-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 24)
; SSE4-NEXT: [[TMP5:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 32)
; SSE4-NEXT: [[TMP6:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 40)
; SSE4-NEXT: [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
; SSE4-NEXT: [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 56
; SSE4-NEXT: [[TMP7:%.*]] = trunc i64 [[A_COERCE1]] to i16
; SSE4-NEXT: [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
; SSE4-NEXT: [[TMP8:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i16
; SSE4-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0]], i64 0
; SSE4-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[B_COERCE1:%.*]], i64 1
; SSE4-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 16)
; SSE4-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 24)
; SSE4-NEXT: [[TMP13:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 32)
; SSE4-NEXT: [[TMP14:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 40)
; SSE4-NEXT: [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
; SSE4-NEXT: [[TMP15:%.*]] = trunc i64 [[B_COERCE1]] to i16
; SSE4-NEXT: [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
; SSE4-NEXT: [[TMP16:%.*]] = and <2 x i64> [[TMP2]], splat (i64 255)
; SSE4-NEXT: [[TMP17:%.*]] = and <2 x i64> [[TMP10]], splat (i64 255)
; SSE4-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
; SSE4-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> [[TMP18]], i16 [[TMP7]], i64 1
; SSE4-NEXT: [[TMP20:%.*]] = lshr <2 x i16> [[TMP19]], splat (i16 8)
; SSE4-NEXT: [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 56
; SSE4-NEXT: [[TMP21:%.*]] = insertelement <2 x i16> poison, i16 [[TMP8]], i64 0
; SSE4-NEXT: [[TMP22:%.*]] = insertelement <2 x i16> [[TMP21]], i16 [[TMP15]], i64 1
; SSE4-NEXT: [[TMP23:%.*]] = lshr <2 x i16> [[TMP22]], splat (i16 8)
; SSE4-NEXT: [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 56
; SSE4-NEXT: [[CONV1_6:%.*]] = and i64 [[A_SROA_7_0_EXTRACT_SHIFT]], 255
; SSE4-NEXT: [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 56
; SSE4-NEXT: [[CONV4_6:%.*]] = and i64 [[B_SROA_7_0_EXTRACT_SHIFT]], 255
; SSE4-NEXT: [[TMP24:%.*]] = add nuw nsw <2 x i64> [[TMP16]], splat (i64 1)
; SSE4-NEXT: [[TMP25:%.*]] = add nuw nsw <2 x i64> [[TMP24]], [[TMP17]]
; SSE4-NEXT: [[TMP26:%.*]] = lshr <2 x i64> [[TMP25]], splat (i64 1)
; SSE4-NEXT: [[TMP27:%.*]] = add nuw nsw <2 x i16> [[TMP20]], splat (i16 1)
; SSE4-NEXT: [[TMP28:%.*]] = add nuw nsw <2 x i16> [[TMP27]], [[TMP23]]
; SSE4-NEXT: [[TMP29:%.*]] = and <2 x i64> [[TMP3]], splat (i64 255)
; SSE4-NEXT: [[TMP30:%.*]] = and <2 x i64> [[TMP11]], splat (i64 255)
; SSE4-NEXT: [[TMP31:%.*]] = add nuw nsw <2 x i64> [[TMP29]], splat (i64 1)
; SSE4-NEXT: [[TMP32:%.*]] = add nuw nsw <2 x i64> [[TMP31]], [[TMP30]]
; SSE4-NEXT: [[TMP33:%.*]] = and <2 x i64> [[TMP4]], splat (i64 255)
; SSE4-NEXT: [[TMP34:%.*]] = and <2 x i64> [[TMP12]], splat (i64 255)
; SSE4-NEXT: [[TMP35:%.*]] = add nuw nsw <2 x i64> [[TMP33]], splat (i64 1)
; SSE4-NEXT: [[TMP36:%.*]] = add nuw nsw <2 x i64> [[TMP35]], [[TMP34]]
; SSE4-NEXT: [[TMP37:%.*]] = and <2 x i64> [[TMP5]], splat (i64 255)
; SSE4-NEXT: [[TMP38:%.*]] = and <2 x i64> [[TMP13]], splat (i64 255)
; SSE4-NEXT: [[TMP39:%.*]] = add nuw nsw <2 x i64> [[TMP37]], splat (i64 1)
; SSE4-NEXT: [[TMP40:%.*]] = add nuw nsw <2 x i64> [[TMP39]], [[TMP38]]
; SSE4-NEXT: [[TMP41:%.*]] = and <2 x i64> [[TMP6]], splat (i64 255)
; SSE4-NEXT: [[TMP42:%.*]] = and <2 x i64> [[TMP14]], splat (i64 255)
; SSE4-NEXT: [[TMP43:%.*]] = add nuw nsw <2 x i64> [[TMP41]], splat (i64 1)
; SSE4-NEXT: [[TMP44:%.*]] = add nuw nsw <2 x i64> [[TMP43]], [[TMP42]]
; SSE4-NEXT: [[CONV1_14:%.*]] = and i64 [[A_SROA_16_8_EXTRACT_SHIFT]], 255
; SSE4-NEXT: [[CONV4_14:%.*]] = and i64 [[B_SROA_16_8_EXTRACT_SHIFT]], 255
; SSE4-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_8_0_EXTRACT_SHIFT]], 1
; SSE4-NEXT: [[ADD_14:%.*]] = add nuw nsw i64 [[CONV1_14]], 1
; SSE4-NEXT: [[ADD5_14:%.*]] = add nuw nsw i64 [[ADD_14]], [[CONV4_14]]
; SSE4-NEXT: [[ADD5_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_8_0_EXTRACT_SHIFT]]
; SSE4-NEXT: [[ADD_15:%.*]] = add nuw nsw i64 [[A_SROA_17_8_EXTRACT_SHIFT]], 1
; SSE4-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[CONV1_6]], 1
; SSE4-NEXT: [[ADD5_15:%.*]] = add nuw nsw i64 [[ADD_15]], [[B_SROA_17_8_EXTRACT_SHIFT]]
; SSE4-NEXT: [[ADD5_6:%.*]] = add nuw nsw i64 [[ADD_6]], [[CONV4_6]]
; SSE4-NEXT: [[TMP45:%.*]] = shl nuw i64 [[ADD5_15]], 55
; SSE4-NEXT: [[TMP46:%.*]] = shl nuw nsw i64 [[ADD5_6]], 47
; SSE4-NEXT: [[RETVAL_SROA_17_8_INSERT_EXT:%.*]] = and i64 [[TMP45]], -72057594037927936
; SSE4-NEXT: [[RETVAL_SROA_7_0_INSERT_SHIFT:%.*]] = and i64 [[TMP46]], 71776119061217280
; SSE4-NEXT: [[TMP47:%.*]] = shl nuw nsw i64 [[ADD5_14]], 47
; SSE4-NEXT: [[TMP48:%.*]] = shl nuw i64 [[ADD5_7]], 55
; SSE4-NEXT: [[RETVAL_SROA_16_8_INSERT_SHIFT:%.*]] = and i64 [[TMP47]], 71776119061217280
; SSE4-NEXT: [[RETVAL_SROA_8_0_INSERT_EXT:%.*]] = and i64 [[TMP48]], -72057594037927936
; SSE4-NEXT: [[RETVAL_SROA_16_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_17_8_INSERT_EXT]], [[RETVAL_SROA_16_8_INSERT_SHIFT]]
; SSE4-NEXT: [[RETVAL_SROA_7_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_0_INSERT_EXT]], [[RETVAL_SROA_7_0_INSERT_SHIFT]]
; SSE4-NEXT: [[TMP49:%.*]] = shl nuw nsw <2 x i64> [[TMP44]], splat (i64 39)
; SSE4-NEXT: [[TMP50:%.*]] = and <2 x i64> [[TMP49]], splat (i64 280375465082880)
; SSE4-NEXT: [[TMP51:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_7_0_INSERT_INSERT]], i64 0
; SSE4-NEXT: [[TMP52:%.*]] = insertelement <2 x i64> [[TMP51]], i64 [[RETVAL_SROA_16_8_INSERT_INSERT]], i64 1
; SSE4-NEXT: [[TMP53:%.*]] = or disjoint <2 x i64> [[TMP52]], [[TMP50]]
; SSE4-NEXT: [[TMP54:%.*]] = shl nuw nsw <2 x i64> [[TMP40]], splat (i64 31)
; SSE4-NEXT: [[TMP55:%.*]] = and <2 x i64> [[TMP54]], splat (i64 1095216660480)
; SSE4-NEXT: [[TMP56:%.*]] = or disjoint <2 x i64> [[TMP53]], [[TMP55]]
; SSE4-NEXT: [[TMP57:%.*]] = shl nuw nsw <2 x i64> [[TMP36]], splat (i64 23)
; SSE4-NEXT: [[TMP58:%.*]] = and <2 x i64> [[TMP57]], splat (i64 4278190080)
; SSE4-NEXT: [[TMP59:%.*]] = or disjoint <2 x i64> [[TMP56]], [[TMP58]]
; SSE4-NEXT: [[TMP60:%.*]] = shl nuw nsw <2 x i64> [[TMP32]], splat (i64 15)
; SSE4-NEXT: [[TMP61:%.*]] = and <2 x i64> [[TMP60]], splat (i64 16711680)
; SSE4-NEXT: [[TMP62:%.*]] = shl nuw <2 x i16> [[TMP28]], splat (i16 7)
; SSE4-NEXT: [[TMP63:%.*]] = or disjoint <2 x i64> [[TMP59]], [[TMP61]]
; SSE4-NEXT: [[TMP64:%.*]] = and <2 x i16> [[TMP62]], splat (i16 -256)
; SSE4-NEXT: [[TMP65:%.*]] = zext <2 x i16> [[TMP64]] to <2 x i64>
; SSE4-NEXT: [[TMP66:%.*]] = or <2 x i64> [[TMP63]], [[TMP65]]
; SSE4-NEXT: [[TMP67:%.*]] = or <2 x i64> [[TMP66]], [[TMP26]]
; SSE4-NEXT: [[TMP68:%.*]] = extractelement <2 x i64> [[TMP67]], i64 0
; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP68]], 0
; SSE4-NEXT: [[TMP69:%.*]] = extractelement <2 x i64> [[TMP67]], i64 1
; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP69]], 1
; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; AVX-LABEL: @avgr_16_u8(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i16
; AVX-NEXT: [[TMP1:%.*]] = lshr i16 [[TMP0]], 8
; AVX-NEXT: [[TMP2:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i16
; AVX-NEXT: [[TMP3:%.*]] = lshr i16 [[TMP2]], 8
; AVX-NEXT: [[TMP4:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i16
; AVX-NEXT: [[TMP5:%.*]] = lshr i16 [[TMP4]], 8
; AVX-NEXT: [[TMP6:%.*]] = trunc i64 [[B_COERCE1:%.*]] to i16
; AVX-NEXT: [[TMP7:%.*]] = lshr i16 [[TMP6]], 8
; AVX-NEXT: [[CONV1:%.*]] = and i64 [[A_COERCE0]], 255
; AVX-NEXT: [[CONV4:%.*]] = and i64 [[B_COERCE0]], 255
; AVX-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[CONV1]], 1
; AVX-NEXT: [[ADD5:%.*]] = add nuw nsw i64 [[ADD]], [[CONV4]]
; AVX-NEXT: [[ADD_1:%.*]] = add nuw nsw i16 [[TMP1]], 1
; AVX-NEXT: [[ADD5_1:%.*]] = add nuw nsw i16 [[ADD_1]], [[TMP5]]
; AVX-NEXT: [[CONV1_8:%.*]] = and i64 [[A_COERCE1]], 255
; AVX-NEXT: [[CONV4_8:%.*]] = and i64 [[B_COERCE1]], 255
; AVX-NEXT: [[ADD_8:%.*]] = add nuw nsw i64 [[CONV1_8]], 1
; AVX-NEXT: [[ADD5_8:%.*]] = add nuw nsw i64 [[ADD_8]], [[CONV4_8]]
; AVX-NEXT: [[ADD_9:%.*]] = add nuw nsw i16 [[TMP3]], 1
; AVX-NEXT: [[ADD5_9:%.*]] = add nuw nsw i16 [[ADD_9]], [[TMP7]]
; AVX-NEXT: [[TMP8:%.*]] = shl nuw i16 [[ADD5_1]], 7
; AVX-NEXT: [[TMP9:%.*]] = and i16 [[TMP8]], -256
; AVX-NEXT: [[TMP10:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 -1, i64 -1>, i64 [[A_COERCE0]], i64 0
; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
; AVX-NEXT: [[TMP12:%.*]] = lshr <8 x i64> [[TMP11]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 0, i64 0>
; AVX-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP12]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 0, i64 0>
; AVX-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT_MASKED:%.*]] = zext i16 [[TMP9]] to i64
; AVX-NEXT: [[TMP14:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE0]], i64 0
; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x i64> [[TMP14]], i64 [[ADD5]], i64 6
; AVX-NEXT: [[TMP16:%.*]] = insertelement <8 x i64> [[TMP15]], i64 [[RETVAL_SROA_2_0_INSERT_SHIFT_MASKED]], i64 7
; AVX-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP16]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
; AVX-NEXT: [[TMP18:%.*]] = lshr <8 x i64> [[TMP17]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 1, i64 0>
; AVX-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP18]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 -1, i64 -1>
; AVX-NEXT: [[TMP20:%.*]] = add nuw nsw <8 x i64> [[TMP13]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 0, i64 0>
; AVX-NEXT: [[TMP21:%.*]] = add nuw nsw <8 x i64> [[TMP19]], [[TMP20]]
; AVX-NEXT: [[TMP22:%.*]] = shl nuw <8 x i64> [[TMP21]], <i64 55, i64 47, i64 39, i64 31, i64 23, i64 15, i64 0, i64 0>
; AVX-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP22]], <i64 -72057594037927936, i64 71776119061217280, i64 280375465082880, i64 1095216660480, i64 4278190080, i64 16711680, i64 -1, i64 -1>
; AVX-NEXT: [[TMP24:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP23]])
; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0
; AVX-NEXT: [[TMP25:%.*]] = shl nuw i16 [[ADD5_9]], 7
; AVX-NEXT: [[TMP26:%.*]] = and i16 [[TMP25]], -256
; AVX-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 -1, i64 -1>, i64 [[A_COERCE1]], i64 0
; AVX-NEXT: [[TMP28:%.*]] = shufflevector <8 x i64> [[TMP27]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
; AVX-NEXT: [[TMP29:%.*]] = lshr <8 x i64> [[TMP28]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 0, i64 0>
; AVX-NEXT: [[TMP30:%.*]] = and <8 x i64> [[TMP29]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 0, i64 0>
; AVX-NEXT: [[RETVAL_SROA_11_8_INSERT_SHIFT_MASKED:%.*]] = zext i16 [[TMP26]] to i64
; AVX-NEXT: [[TMP31:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE1]], i64 0
; AVX-NEXT: [[TMP32:%.*]] = insertelement <8 x i64> [[TMP31]], i64 [[ADD5_8]], i64 6
; AVX-NEXT: [[TMP33:%.*]] = insertelement <8 x i64> [[TMP32]], i64 [[RETVAL_SROA_11_8_INSERT_SHIFT_MASKED]], i64 7
; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x i64> [[TMP33]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
; AVX-NEXT: [[TMP35:%.*]] = lshr <8 x i64> [[TMP34]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 1, i64 0>
; AVX-NEXT: [[TMP36:%.*]] = and <8 x i64> [[TMP35]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 -1, i64 -1>
; AVX-NEXT: [[TMP37:%.*]] = add nuw nsw <8 x i64> [[TMP30]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 0, i64 0>
; AVX-NEXT: [[TMP38:%.*]] = add nuw nsw <8 x i64> [[TMP36]], [[TMP37]]
; AVX-NEXT: [[TMP39:%.*]] = shl nuw <8 x i64> [[TMP38]], <i64 55, i64 47, i64 39, i64 31, i64 23, i64 15, i64 0, i64 0>
; AVX-NEXT: [[TMP40:%.*]] = and <8 x i64> [[TMP39]], <i64 -72057594037927936, i64 71776119061217280, i64 280375465082880, i64 1095216660480, i64 4278190080, i64 16711680, i64 -1, i64 -1>
; AVX-NEXT: [[TMP41:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP40]])
; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP41]], 1
; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
entry:
%retval = alloca %"struct.std::array16", align 1
%a = alloca %"struct.std::array16", align 1
%b = alloca %"struct.std::array16", align 1
store i64 %a.coerce0, ptr %a, align 1
%0 = getelementptr inbounds nuw i8, ptr %a, i64 8
store i64 %a.coerce1, ptr %0, align 1
store i64 %b.coerce0, ptr %b, align 1
%1 = getelementptr inbounds nuw i8, ptr %b, i64 8
store i64 %b.coerce1, ptr %1, align 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp samesign ult i64 %i.0, 16
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
%.fca.0.load = load i64, ptr %retval, align 1
%.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0
%.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8
%.fca.1.load = load i64, ptr %.fca.1.gep, align 1
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1
ret { i64, i64 } %.fca.1.insert
for.body: ; preds = %for.cond
%arrayidx.i = getelementptr inbounds nuw i8, ptr %a, i64 %i.0
%2 = load i8, ptr %arrayidx.i, align 1
%conv1 = zext i8 %2 to i16
%arrayidx.i12 = getelementptr inbounds nuw i8, ptr %b, i64 %i.0
%3 = load i8, ptr %arrayidx.i12, align 1
%conv4 = zext i8 %3 to i16
%add = add nuw nsw i16 %conv1, %conv4
%add5 = add nuw nsw i16 %add, 1
%shr = lshr i16 %add5, 1
%conv6 = trunc i16 %shr to i8
%arrayidx.i13 = getelementptr inbounds nuw i8, ptr %retval, i64 %i.0
store i8 %conv6, ptr %arrayidx.i13, align 1
%inc = add nuw nsw i64 %i.0, 1
br label %for.cond
}
define { i64, i64 } @avgr_16_u8_alt(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
; SSE2-LABEL: @avgr_16_u8_alt(
; SSE2-NEXT: entry:
; SSE2-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i8
; SSE2-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 8
; SSE2-NEXT: [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16
; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 24
; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
; SSE2-NEXT: [[A_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_5_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 40
; SSE2-NEXT: [[A_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_6_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
; SSE2-NEXT: [[A_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_7_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 56
; SSE2-NEXT: [[A_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_8_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i8
; SSE2-NEXT: [[A_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 8
; SSE2-NEXT: [[A_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_11_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 16
; SSE2-NEXT: [[A_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_12_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 24
; SSE2-NEXT: [[A_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_13_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
; SSE2-NEXT: [[A_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_14_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 40
; SSE2-NEXT: [[A_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_15_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
; SSE2-NEXT: [[A_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_16_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 56
; SSE2-NEXT: [[A_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_17_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i8
; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 8
; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16
; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 24
; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
; SSE2-NEXT: [[B_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_5_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 40
; SSE2-NEXT: [[B_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_6_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
; SSE2-NEXT: [[B_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_7_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 56
; SSE2-NEXT: [[B_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_8_0_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE1:%.*]] to i8
; SSE2-NEXT: [[B_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 8
; SSE2-NEXT: [[B_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_11_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16
; SSE2-NEXT: [[B_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_12_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 24
; SSE2-NEXT: [[B_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_13_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
; SSE2-NEXT: [[B_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_14_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 40
; SSE2-NEXT: [[B_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_15_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
; SSE2-NEXT: [[B_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_16_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 56
; SSE2-NEXT: [[B_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_17_8_EXTRACT_SHIFT]] to i8
; SSE2-NEXT: [[SHR:%.*]] = lshr i8 [[A_SROA_0_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5:%.*]] = lshr i8 [[B_SROA_0_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW:%.*]] = add nuw i8 [[SHR5]], [[SHR]]
; SSE2-NEXT: [[OR21:%.*]] = or i8 [[B_SROA_0_0_EXTRACT_TRUNC]], [[A_SROA_0_0_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP0:%.*]] = and i8 [[OR21]], 1
; SSE2-NEXT: [[ADD12:%.*]] = add nuw i8 [[NARROW]], [[TMP0]]
; SSE2-NEXT: [[SHR_1:%.*]] = lshr i8 [[A_SROA_2_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_1:%.*]] = lshr i8 [[B_SROA_2_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_1:%.*]] = add nuw i8 [[SHR5_1]], [[SHR_1]]
; SSE2-NEXT: [[OR21_1:%.*]] = or i8 [[B_SROA_2_0_EXTRACT_TRUNC]], [[A_SROA_2_0_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP1:%.*]] = and i8 [[OR21_1]], 1
; SSE2-NEXT: [[ADD12_1:%.*]] = add nuw i8 [[NARROW_1]], [[TMP1]]
; SSE2-NEXT: [[SHR_2:%.*]] = lshr i8 [[A_SROA_3_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_2:%.*]] = lshr i8 [[B_SROA_3_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_2:%.*]] = add nuw i8 [[SHR5_2]], [[SHR_2]]
; SSE2-NEXT: [[OR21_2:%.*]] = or i8 [[B_SROA_3_0_EXTRACT_TRUNC]], [[A_SROA_3_0_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP2:%.*]] = and i8 [[OR21_2]], 1
; SSE2-NEXT: [[ADD12_2:%.*]] = add nuw i8 [[NARROW_2]], [[TMP2]]
; SSE2-NEXT: [[SHR_3:%.*]] = lshr i8 [[A_SROA_4_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_3:%.*]] = lshr i8 [[B_SROA_4_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_3:%.*]] = add nuw i8 [[SHR5_3]], [[SHR_3]]
; SSE2-NEXT: [[OR21_3:%.*]] = or i8 [[B_SROA_4_0_EXTRACT_TRUNC]], [[A_SROA_4_0_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP3:%.*]] = and i8 [[OR21_3]], 1
; SSE2-NEXT: [[ADD12_3:%.*]] = add nuw i8 [[NARROW_3]], [[TMP3]]
; SSE2-NEXT: [[SHR_4:%.*]] = lshr i8 [[A_SROA_5_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_4:%.*]] = lshr i8 [[B_SROA_5_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_4:%.*]] = add nuw i8 [[SHR5_4]], [[SHR_4]]
; SSE2-NEXT: [[OR21_4:%.*]] = or i8 [[B_SROA_5_0_EXTRACT_TRUNC]], [[A_SROA_5_0_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP4:%.*]] = and i8 [[OR21_4]], 1
; SSE2-NEXT: [[ADD12_4:%.*]] = add nuw i8 [[NARROW_4]], [[TMP4]]
; SSE2-NEXT: [[SHR_5:%.*]] = lshr i8 [[A_SROA_6_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_5:%.*]] = lshr i8 [[B_SROA_6_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_5:%.*]] = add nuw i8 [[SHR5_5]], [[SHR_5]]
; SSE2-NEXT: [[OR21_5:%.*]] = or i8 [[B_SROA_6_0_EXTRACT_TRUNC]], [[A_SROA_6_0_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP5:%.*]] = and i8 [[OR21_5]], 1
; SSE2-NEXT: [[ADD12_5:%.*]] = add nuw i8 [[NARROW_5]], [[TMP5]]
; SSE2-NEXT: [[SHR_6:%.*]] = lshr i8 [[A_SROA_7_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_6:%.*]] = lshr i8 [[B_SROA_7_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_6:%.*]] = add nuw i8 [[SHR5_6]], [[SHR_6]]
; SSE2-NEXT: [[OR21_6:%.*]] = or i8 [[B_SROA_7_0_EXTRACT_TRUNC]], [[A_SROA_7_0_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP6:%.*]] = and i8 [[OR21_6]], 1
; SSE2-NEXT: [[ADD12_6:%.*]] = add nuw i8 [[NARROW_6]], [[TMP6]]
; SSE2-NEXT: [[SHR_7:%.*]] = lshr i8 [[A_SROA_8_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_7:%.*]] = lshr i8 [[B_SROA_8_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_7:%.*]] = add nuw i8 [[SHR5_7]], [[SHR_7]]
; SSE2-NEXT: [[OR21_7:%.*]] = or i8 [[B_SROA_8_0_EXTRACT_TRUNC]], [[A_SROA_8_0_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP7:%.*]] = and i8 [[OR21_7]], 1
; SSE2-NEXT: [[ADD12_7:%.*]] = add nuw i8 [[NARROW_7]], [[TMP7]]
; SSE2-NEXT: [[SHR_8:%.*]] = lshr i8 [[A_SROA_9_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_8:%.*]] = lshr i8 [[B_SROA_9_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_8:%.*]] = add nuw i8 [[SHR5_8]], [[SHR_8]]
; SSE2-NEXT: [[OR21_8:%.*]] = or i8 [[B_SROA_9_8_EXTRACT_TRUNC]], [[A_SROA_9_8_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP8:%.*]] = and i8 [[OR21_8]], 1
; SSE2-NEXT: [[ADD12_8:%.*]] = add nuw i8 [[NARROW_8]], [[TMP8]]
; SSE2-NEXT: [[SHR_9:%.*]] = lshr i8 [[A_SROA_11_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_9:%.*]] = lshr i8 [[B_SROA_11_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_9:%.*]] = add nuw i8 [[SHR5_9]], [[SHR_9]]
; SSE2-NEXT: [[OR21_9:%.*]] = or i8 [[B_SROA_11_8_EXTRACT_TRUNC]], [[A_SROA_11_8_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP9:%.*]] = and i8 [[OR21_9]], 1
; SSE2-NEXT: [[ADD12_9:%.*]] = add nuw i8 [[NARROW_9]], [[TMP9]]
; SSE2-NEXT: [[SHR_10:%.*]] = lshr i8 [[A_SROA_12_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_10:%.*]] = lshr i8 [[B_SROA_12_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_10:%.*]] = add nuw i8 [[SHR5_10]], [[SHR_10]]
; SSE2-NEXT: [[OR21_10:%.*]] = or i8 [[B_SROA_12_8_EXTRACT_TRUNC]], [[A_SROA_12_8_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP10:%.*]] = and i8 [[OR21_10]], 1
; SSE2-NEXT: [[ADD12_10:%.*]] = add nuw i8 [[NARROW_10]], [[TMP10]]
; SSE2-NEXT: [[SHR_11:%.*]] = lshr i8 [[A_SROA_13_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_11:%.*]] = lshr i8 [[B_SROA_13_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_11:%.*]] = add nuw i8 [[SHR5_11]], [[SHR_11]]
; SSE2-NEXT: [[OR21_11:%.*]] = or i8 [[B_SROA_13_8_EXTRACT_TRUNC]], [[A_SROA_13_8_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP11:%.*]] = and i8 [[OR21_11]], 1
; SSE2-NEXT: [[ADD12_11:%.*]] = add nuw i8 [[NARROW_11]], [[TMP11]]
; SSE2-NEXT: [[SHR_12:%.*]] = lshr i8 [[A_SROA_14_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_12:%.*]] = lshr i8 [[B_SROA_14_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_12:%.*]] = add nuw i8 [[SHR5_12]], [[SHR_12]]
; SSE2-NEXT: [[OR21_12:%.*]] = or i8 [[B_SROA_14_8_EXTRACT_TRUNC]], [[A_SROA_14_8_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP12:%.*]] = and i8 [[OR21_12]], 1
; SSE2-NEXT: [[ADD12_12:%.*]] = add nuw i8 [[NARROW_12]], [[TMP12]]
; SSE2-NEXT: [[SHR_13:%.*]] = lshr i8 [[A_SROA_15_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_13:%.*]] = lshr i8 [[B_SROA_15_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_13:%.*]] = add nuw i8 [[SHR5_13]], [[SHR_13]]
; SSE2-NEXT: [[OR21_13:%.*]] = or i8 [[B_SROA_15_8_EXTRACT_TRUNC]], [[A_SROA_15_8_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP13:%.*]] = and i8 [[OR21_13]], 1
; SSE2-NEXT: [[ADD12_13:%.*]] = add nuw i8 [[NARROW_13]], [[TMP13]]
; SSE2-NEXT: [[SHR_14:%.*]] = lshr i8 [[A_SROA_16_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_14:%.*]] = lshr i8 [[B_SROA_16_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_14:%.*]] = add nuw i8 [[SHR5_14]], [[SHR_14]]
; SSE2-NEXT: [[OR21_14:%.*]] = or i8 [[B_SROA_16_8_EXTRACT_TRUNC]], [[A_SROA_16_8_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP14:%.*]] = and i8 [[OR21_14]], 1
; SSE2-NEXT: [[ADD12_14:%.*]] = add nuw i8 [[NARROW_14]], [[TMP14]]
; SSE2-NEXT: [[SHR_15:%.*]] = lshr i8 [[A_SROA_17_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_15:%.*]] = lshr i8 [[B_SROA_17_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_15:%.*]] = add nuw i8 [[SHR5_15]], [[SHR_15]]
; SSE2-NEXT: [[OR21_15:%.*]] = or i8 [[B_SROA_17_8_EXTRACT_TRUNC]], [[A_SROA_17_8_EXTRACT_TRUNC]]
; SSE2-NEXT: [[TMP15:%.*]] = and i8 [[OR21_15]], 1
; SSE2-NEXT: [[ADD12_15:%.*]] = add nuw i8 [[NARROW_15]], [[TMP15]]
; SSE2-NEXT: [[RETVAL_SROA_8_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_7]] to i64
; SSE2-NEXT: [[RETVAL_SROA_8_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_8_0_INSERT_EXT]], 56
; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_6]] to i64
; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_0_INSERT_EXT]], 48
; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_0_INSERT_SHIFT]], [[RETVAL_SROA_7_0_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_6_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_5]] to i64
; SSE2-NEXT: [[RETVAL_SROA_6_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_6_0_INSERT_EXT]], 40
; SSE2-NEXT: [[RETVAL_SROA_6_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_7_0_INSERT_INSERT]], [[RETVAL_SROA_6_0_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_5_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_4]] to i64
; SSE2-NEXT: [[RETVAL_SROA_5_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_5_0_INSERT_EXT]], 32
; SSE2-NEXT: [[RETVAL_SROA_5_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_6_0_INSERT_INSERT]], [[RETVAL_SROA_5_0_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_3]] to i64
; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 24
; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_5_0_INSERT_INSERT]], [[RETVAL_SROA_4_0_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_2]] to i64
; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 16
; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_1]] to i64
; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 8
; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_MASK:%.*]] = or disjoint i64 [[RETVAL_SROA_4_0_INSERT_INSERT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i8 [[ADD12]] to i64
; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_MASK:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_MASK]], [[RETVAL_SROA_2_0_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_0_0_INSERT_MASK]], [[RETVAL_SROA_0_0_INSERT_EXT]]
; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0
; SSE2-NEXT: [[RETVAL_SROA_17_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_15]] to i64
; SSE2-NEXT: [[RETVAL_SROA_17_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_17_8_INSERT_EXT]], 56
; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_14]] to i64
; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_16_8_INSERT_EXT]], 48
; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_17_8_INSERT_SHIFT]], [[RETVAL_SROA_16_8_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_15_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_13]] to i64
; SSE2-NEXT: [[RETVAL_SROA_15_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_15_8_INSERT_EXT]], 40
; SSE2-NEXT: [[RETVAL_SROA_15_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_16_8_INSERT_INSERT]], [[RETVAL_SROA_15_8_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_14_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_12]] to i64
; SSE2-NEXT: [[RETVAL_SROA_14_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_14_8_INSERT_EXT]], 32
; SSE2-NEXT: [[RETVAL_SROA_14_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_15_8_INSERT_INSERT]], [[RETVAL_SROA_14_8_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_13_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_11]] to i64
; SSE2-NEXT: [[RETVAL_SROA_13_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_13_8_INSERT_EXT]], 24
; SSE2-NEXT: [[RETVAL_SROA_13_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_14_8_INSERT_INSERT]], [[RETVAL_SROA_13_8_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_12_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_10]] to i64
; SSE2-NEXT: [[RETVAL_SROA_12_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_12_8_INSERT_EXT]], 16
; SSE2-NEXT: [[RETVAL_SROA_11_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_9]] to i64
; SSE2-NEXT: [[RETVAL_SROA_11_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_11_8_INSERT_EXT]], 8
; SSE2-NEXT: [[RETVAL_SROA_11_8_INSERT_MASK:%.*]] = or disjoint i64 [[RETVAL_SROA_13_8_INSERT_INSERT]], [[RETVAL_SROA_12_8_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_8]] to i64
; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_MASK:%.*]] = or i64 [[RETVAL_SROA_11_8_INSERT_MASK]], [[RETVAL_SROA_11_8_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_9_8_INSERT_MASK]], [[RETVAL_SROA_9_8_INSERT_EXT]]
; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_9_8_INSERT_INSERT]], 1
; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; SSE4-LABEL: @avgr_16_u8(
; SSE4-NEXT: entry:
; SSE4-NEXT: [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 56
; SSE4-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 40
; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
; SSE4-NEXT: [[A_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 24
; SSE4-NEXT: [[A_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16
; SSE4-NEXT: [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 8
; SSE4-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_8_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_5_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_6_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_7_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE0]] to i8
; SSE4-NEXT: [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 56
; SSE4-NEXT: [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
; SSE4-NEXT: [[B_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 40
; SSE4-NEXT: [[B_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 24
; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16
; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 8
; SSE4-NEXT: [[B_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_8_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_7_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_6_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_5_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE0]] to i8
; SSE4-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A_SROA_8_0_EXTRACT_TRUNC]], i64 0
; SSE4-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> [[TMP0]], i8 [[A_SROA_7_0_EXTRACT_TRUNC]], i64 1
; SSE4-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> [[TMP1]], i8 [[A_SROA_6_0_EXTRACT_TRUNC]], i64 2
; SSE4-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[A_SROA_5_0_EXTRACT_TRUNC]], i64 3
; SSE4-NEXT: [[TMP4:%.*]] = insertelement <8 x i8> [[TMP3]], i8 [[A_SROA_4_0_EXTRACT_TRUNC]], i64 4
; SSE4-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP4]], i8 [[A_SROA_3_0_EXTRACT_TRUNC]], i64 5
; SSE4-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[A_SROA_2_0_EXTRACT_TRUNC]], i64 6
; SSE4-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[TMP6]], i8 [[A_SROA_0_0_EXTRACT_TRUNC]], i64 7
; SSE4-NEXT: [[TMP8:%.*]] = lshr <8 x i8> [[TMP7]], splat (i8 1)
; SSE4-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> poison, i8 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 0
; SSE4-NEXT: [[TMP10:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[B_SROA_7_0_EXTRACT_TRUNC]], i64 1
; SSE4-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP10]], i8 [[B_SROA_6_0_EXTRACT_TRUNC]], i64 2
; SSE4-NEXT: [[TMP12:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[B_SROA_5_0_EXTRACT_TRUNC]], i64 3
; SSE4-NEXT: [[TMP13:%.*]] = insertelement <8 x i8> [[TMP12]], i8 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 4
; SSE4-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP13]], i8 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 5
; SSE4-NEXT: [[TMP15:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 6
; SSE4-NEXT: [[TMP16:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[B_SROA_8_0_EXTRACT_TRUNC]], i64 7
; SSE4-NEXT: [[TMP17:%.*]] = lshr <8 x i8> [[TMP16]], splat (i8 1)
; SSE4-NEXT: [[TMP18:%.*]] = add nuw <8 x i8> [[TMP17]], [[TMP8]]
; SSE4-NEXT: [[TMP19:%.*]] = or <8 x i8> [[TMP16]], [[TMP7]]
; SSE4-NEXT: [[TMP20:%.*]] = and <8 x i8> [[TMP19]], splat (i8 1)
; SSE4-NEXT: [[TMP21:%.*]] = add nuw <8 x i8> [[TMP18]], [[TMP20]]
; SSE4-NEXT: [[TMP22:%.*]] = zext <8 x i8> [[TMP21]] to <8 x i64>
; SSE4-NEXT: [[TMP23:%.*]] = shl nuw <8 x i64> [[TMP22]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
; SSE4-NEXT: [[TMP24:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP23]])
; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0
; SSE4-NEXT: [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1:%.*]], 56
; SSE4-NEXT: [[A_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
; SSE4-NEXT: [[A_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 40
; SSE4-NEXT: [[A_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
; SSE4-NEXT: [[A_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 24
; SSE4-NEXT: [[A_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 16
; SSE4-NEXT: [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 8
; SSE4-NEXT: [[A_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_17_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_11_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_12_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_13_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_14_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_15_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_16_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[A_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE1]] to i8
; SSE4-NEXT: [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 56
; SSE4-NEXT: [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
; SSE4-NEXT: [[B_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 40
; SSE4-NEXT: [[B_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
; SSE4-NEXT: [[B_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 24
; SSE4-NEXT: [[B_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16
; SSE4-NEXT: [[B_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 8
; SSE4-NEXT: [[B_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_17_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_16_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_15_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_14_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_13_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_12_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_11_8_EXTRACT_SHIFT]] to i8
; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE1]] to i8
; SSE4-NEXT: [[TMP25:%.*]] = insertelement <8 x i8> poison, i8 [[A_SROA_17_8_EXTRACT_TRUNC]], i64 0
; SSE4-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[A_SROA_16_8_EXTRACT_TRUNC]], i64 1
; SSE4-NEXT: [[TMP27:%.*]] = insertelement <8 x i8> [[TMP26]], i8 [[A_SROA_15_8_EXTRACT_TRUNC]], i64 2
; SSE4-NEXT: [[TMP28:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[A_SROA_14_8_EXTRACT_TRUNC]], i64 3
; SSE4-NEXT: [[TMP29:%.*]] = insertelement <8 x i8> [[TMP28]], i8 [[A_SROA_13_8_EXTRACT_TRUNC]], i64 4
; SSE4-NEXT: [[TMP30:%.*]] = insertelement <8 x i8> [[TMP29]], i8 [[A_SROA_12_8_EXTRACT_TRUNC]], i64 5
; SSE4-NEXT: [[TMP31:%.*]] = insertelement <8 x i8> [[TMP30]], i8 [[A_SROA_11_8_EXTRACT_TRUNC]], i64 6
; SSE4-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[A_SROA_9_8_EXTRACT_TRUNC]], i64 7
; SSE4-NEXT: [[TMP33:%.*]] = lshr <8 x i8> [[TMP32]], splat (i8 1)
; SSE4-NEXT: [[TMP34:%.*]] = insertelement <8 x i8> poison, i8 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 0
; SSE4-NEXT: [[TMP35:%.*]] = insertelement <8 x i8> [[TMP34]], i8 [[B_SROA_16_8_EXTRACT_TRUNC]], i64 1
; SSE4-NEXT: [[TMP36:%.*]] = insertelement <8 x i8> [[TMP35]], i8 [[B_SROA_15_8_EXTRACT_TRUNC]], i64 2
; SSE4-NEXT: [[TMP37:%.*]] = insertelement <8 x i8> [[TMP36]], i8 [[B_SROA_14_8_EXTRACT_TRUNC]], i64 3
; SSE4-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP37]], i8 [[B_SROA_13_8_EXTRACT_TRUNC]], i64 4
; SSE4-NEXT: [[TMP39:%.*]] = insertelement <8 x i8> [[TMP38]], i8 [[B_SROA_12_8_EXTRACT_TRUNC]], i64 5
; SSE4-NEXT: [[TMP40:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[B_SROA_11_8_EXTRACT_TRUNC]], i64 6
; SSE4-NEXT: [[TMP41:%.*]] = insertelement <8 x i8> [[TMP40]], i8 [[B_SROA_17_8_EXTRACT_TRUNC]], i64 7
; SSE4-NEXT: [[TMP42:%.*]] = lshr <8 x i8> [[TMP41]], splat (i8 1)
; SSE4-NEXT: [[TMP43:%.*]] = add nuw <8 x i8> [[TMP42]], [[TMP33]]
; SSE4-NEXT: [[TMP44:%.*]] = or <8 x i8> [[TMP41]], [[TMP32]]
; SSE4-NEXT: [[TMP45:%.*]] = and <8 x i8> [[TMP44]], splat (i8 1)
; SSE4-NEXT: [[TMP46:%.*]] = add nuw <8 x i8> [[TMP43]], [[TMP45]]
; SSE4-NEXT: [[TMP47:%.*]] = zext <8 x i8> [[TMP46]] to <8 x i64>
; SSE4-NEXT: [[TMP48:%.*]] = shl nuw <8 x i64> [[TMP47]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
; SSE4-NEXT: [[TMP49:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP48]])
; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP49]], 1
; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; AVX-LABEL: @avgr_16_u8(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> poison, i64 [[A_COERCE0:%.*]], i64 0
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP0]], <8 x i64> poison, <8 x i32> zeroinitializer
; AVX-NEXT: [[TMP2:%.*]] = lshr <8 x i64> [[TMP1]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
; AVX-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i8>
; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> zeroinitializer
; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i64> [[TMP5]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
; AVX-NEXT: [[TMP7:%.*]] = trunc <8 x i64> [[TMP6]] to <8 x i8>
; AVX-NEXT: [[TMP8:%.*]] = lshr <8 x i8> [[TMP3]], splat (i8 1)
; AVX-NEXT: [[TMP9:%.*]] = lshr <8 x i8> [[TMP7]], splat (i8 1)
; AVX-NEXT: [[TMP10:%.*]] = add nuw <8 x i8> [[TMP9]], [[TMP8]]
; AVX-NEXT: [[TMP11:%.*]] = or <8 x i8> [[TMP7]], [[TMP3]]
; AVX-NEXT: [[TMP12:%.*]] = and <8 x i8> [[TMP11]], splat (i8 1)
; AVX-NEXT: [[TMP13:%.*]] = add nuw <8 x i8> [[TMP10]], [[TMP12]]
; AVX-NEXT: [[TMP14:%.*]] = zext <8 x i8> [[TMP13]] to <8 x i64>
; AVX-NEXT: [[TMP15:%.*]] = shl nuw <8 x i64> [[TMP14]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
; AVX-NEXT: [[TMP16:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP15]])
; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP16]], 0
; AVX-NEXT: [[TMP17:%.*]] = insertelement <8 x i64> poison, i64 [[A_COERCE1:%.*]], i64 0
; AVX-NEXT: [[TMP18:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> poison, <8 x i32> zeroinitializer
; AVX-NEXT: [[TMP19:%.*]] = lshr <8 x i64> [[TMP18]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
; AVX-NEXT: [[TMP20:%.*]] = trunc <8 x i64> [[TMP19]] to <8 x i8>
; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE1:%.*]], i64 0
; AVX-NEXT: [[TMP22:%.*]] = shufflevector <8 x i64> [[TMP21]], <8 x i64> poison, <8 x i32> zeroinitializer
; AVX-NEXT: [[TMP23:%.*]] = lshr <8 x i64> [[TMP22]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
; AVX-NEXT: [[TMP24:%.*]] = trunc <8 x i64> [[TMP23]] to <8 x i8>
; AVX-NEXT: [[TMP25:%.*]] = lshr <8 x i8> [[TMP20]], splat (i8 1)
; AVX-NEXT: [[TMP26:%.*]] = lshr <8 x i8> [[TMP24]], splat (i8 1)
; AVX-NEXT: [[TMP27:%.*]] = add nuw <8 x i8> [[TMP26]], [[TMP25]]
; AVX-NEXT: [[TMP28:%.*]] = or <8 x i8> [[TMP24]], [[TMP20]]
; AVX-NEXT: [[TMP29:%.*]] = and <8 x i8> [[TMP28]], splat (i8 1)
; AVX-NEXT: [[TMP30:%.*]] = add nuw <8 x i8> [[TMP27]], [[TMP29]]
; AVX-NEXT: [[TMP31:%.*]] = zext <8 x i8> [[TMP30]] to <8 x i64>
; AVX-NEXT: [[TMP32:%.*]] = shl nuw <8 x i64> [[TMP31]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
; AVX-NEXT: [[TMP33:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP32]])
; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP33]], 1
; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
entry:
%retval = alloca %"struct.std::array16", align 1
%a = alloca %"struct.std::array16", align 1
%b = alloca %"struct.std::array16", align 1
store i64 %a.coerce0, ptr %a, align 1
%0 = getelementptr inbounds nuw i8, ptr %a, i64 8
store i64 %a.coerce1, ptr %0, align 1
store i64 %b.coerce0, ptr %b, align 1
%1 = getelementptr inbounds nuw i8, ptr %b, i64 8
store i64 %b.coerce1, ptr %1, align 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp samesign ult i64 %i.0, 16
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
%.fca.0.load = load i64, ptr %retval, align 1
%.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0
%.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8
%.fca.1.load = load i64, ptr %.fca.1.gep, align 1
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1
ret { i64, i64 } %.fca.1.insert
for.body: ; preds = %for.cond
%arrayidx.i = getelementptr inbounds nuw i8, ptr %a, i64 %i.0
%2 = load i8, ptr %arrayidx.i, align 1
%arrayidx.i22 = getelementptr inbounds nuw i8, ptr %b, i64 %i.0
%3 = load i8, ptr %arrayidx.i22, align 1
%shr = lshr i8 %2, 1
%shr5 = lshr i8 %3, 1
%narrow = add nuw i8 %shr, %shr5
%or21 = or i8 %2, %3
%4 = and i8 %or21, 1
%add12 = add i8 %narrow, %4
%arrayidx.i23 = getelementptr inbounds nuw i8, ptr %retval, i64 %i.0
store i8 %add12, ptr %arrayidx.i23, align 1
%inc = add nuw nsw i64 %i.0, 1
br label %for.cond
}
define { i64, i64 } @avgr_8_u16(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
; SSE2-LABEL: @avgr_8_u16(
; SSE2-NEXT: entry:
; SSE2-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32
; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
; SSE2-NEXT: [[TMP2:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32
; SSE2-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0
; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP18]], i64 [[B_COERCE1:%.*]], i64 1
; SSE2-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32>
; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
; SSE2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
; SSE2-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[A_COERCE1]], i64 1
; SSE2-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535)
; SSE2-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP3]], splat (i64 65535)
; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP2]], i64 1
; SSE2-NEXT: [[TMP11:%.*]] = lshr <2 x i32> [[TMP10]], splat (i32 16)
; SSE2-NEXT: [[CONV2_4:%.*]] = lshr i64 [[B_COERCE0]], 48
; SSE2-NEXT: [[TMP20:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 16)
; SSE2-NEXT: [[CONV_6:%.*]] = lshr i64 [[A_COERCE1]], 48
; SSE2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = and i64 [[A_SROA_3_0_EXTRACT_SHIFT]], 65535
; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
; SSE2-NEXT: [[CONV2_2:%.*]] = and i64 [[B_SROA_3_0_EXTRACT_SHIFT]], 65535
; SSE2-NEXT: [[TMP31:%.*]] = add nuw nsw <2 x i64> [[TMP7]], splat (i64 1)
; SSE2-NEXT: [[TMP14:%.*]] = add nuw nsw <2 x i64> [[TMP31]], [[TMP8]]
; SSE2-NEXT: [[TMP15:%.*]] = lshr <2 x i64> [[TMP14]], splat (i64 1)
; SSE2-NEXT: [[TMP16:%.*]] = add nuw nsw <2 x i32> [[TMP11]], splat (i32 1)
; SSE2-NEXT: [[TMP17:%.*]] = add nuw nsw <2 x i32> [[TMP16]], [[TMP20]]
; SSE2-NEXT: [[CONV_7:%.*]] = and i64 [[A_SROA_8_8_EXTRACT_SHIFT]], 65535
; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = and i64 [[B_SROA_8_8_EXTRACT_SHIFT]], 65535
; SSE2-NEXT: [[ADD_4:%.*]] = add nuw nsw i64 [[A_SROA_4_0_EXTRACT_SHIFT]], 1
; SSE2-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[CONV_7]], 1
; SSE2-NEXT: [[ADD3_3:%.*]] = add nuw nsw i64 [[ADD_3]], [[B_SROA_4_0_EXTRACT_SHIFT]]
; SSE2-NEXT: [[ADD3_4:%.*]] = add nuw nsw i64 [[ADD_4]], [[CONV2_4]]
; SSE2-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[CONV_6]], 1
; SSE2-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_9_8_EXTRACT_SHIFT]], 1
; SSE2-NEXT: [[ADD3_7:%.*]] = add nuw nsw i64 [[ADD_6]], [[B_SROA_9_8_EXTRACT_SHIFT]]
; SSE2-NEXT: [[ADD3_2:%.*]] = add nuw nsw i64 [[ADD_7]], [[CONV2_2]]
; SSE2-NEXT: [[TMP12:%.*]] = shl nuw i64 [[ADD3_7]], 47
; SSE2-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[ADD3_2]], 31
; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = and i64 [[TMP12]], -281474976710656
; SSE2-NEXT: [[SHR_4:%.*]] = and i64 [[TMP9]], 281470681743360
; SSE2-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[ADD3_3]], 31
; SSE2-NEXT: [[TMP21:%.*]] = shl nuw i64 [[ADD3_4]], 47
; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = and i64 [[TMP13]], 281470681743360
; SSE2-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = and i64 [[TMP21]], -281474976710656
; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_9_8_INSERT_EXT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]]
; SSE2-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[SHR_4]]
; SSE2-NEXT: [[TMP22:%.*]] = shl nuw <2 x i32> [[TMP17]], splat (i32 15)
; SSE2-NEXT: [[TMP23:%.*]] = and <2 x i32> [[TMP22]], splat (i32 -65536)
; SSE2-NEXT: [[TMP24:%.*]] = zext <2 x i32> [[TMP23]] to <2 x i64>
; SSE2-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], i64 0
; SSE2-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> [[TMP25]], i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], i64 1
; SSE2-NEXT: [[TMP27:%.*]] = or disjoint <2 x i64> [[TMP26]], [[TMP24]]
; SSE2-NEXT: [[TMP28:%.*]] = or disjoint <2 x i64> [[TMP27]], [[TMP15]]
; SSE2-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i64 0
; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP29]], 0
; SSE2-NEXT: [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i64 1
; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP30]], 1
; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; SSE4-LABEL: @avgr_8_u16(
; SSE4-NEXT: entry:
; SSE4-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32
; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
; SSE4-NEXT: [[TMP1:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32
; SSE4-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
; SSE4-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0
; SSE4-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[B_COERCE1:%.*]], i64 1
; SSE4-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32>
; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
; SSE4-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
; SSE4-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
; SSE4-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[A_COERCE1]], i64 1
; SSE4-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535)
; SSE4-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP3]], splat (i64 65535)
; SSE4-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
; SSE4-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP1]], i64 1
; SSE4-NEXT: [[TMP11:%.*]] = lshr <2 x i32> [[TMP10]], splat (i32 16)
; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
; SSE4-NEXT: [[TMP12:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 16)
; SSE4-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
; SSE4-NEXT: [[CONV_2:%.*]] = and i64 [[A_SROA_3_0_EXTRACT_SHIFT]], 65535
; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
; SSE4-NEXT: [[CONV2_2:%.*]] = and i64 [[B_SROA_3_0_EXTRACT_SHIFT]], 65535
; SSE4-NEXT: [[TMP13:%.*]] = add nuw nsw <2 x i64> [[TMP7]], splat (i64 1)
; SSE4-NEXT: [[TMP14:%.*]] = add nuw nsw <2 x i64> [[TMP13]], [[TMP8]]
; SSE4-NEXT: [[TMP15:%.*]] = lshr <2 x i64> [[TMP14]], splat (i64 1)
; SSE4-NEXT: [[TMP16:%.*]] = add nuw nsw <2 x i32> [[TMP11]], splat (i32 1)
; SSE4-NEXT: [[TMP17:%.*]] = add nuw nsw <2 x i32> [[TMP16]], [[TMP12]]
; SSE4-NEXT: [[CONV_6:%.*]] = and i64 [[A_SROA_8_8_EXTRACT_SHIFT]], 65535
; SSE4-NEXT: [[CONV2_6:%.*]] = and i64 [[B_SROA_8_8_EXTRACT_SHIFT]], 65535
; SSE4-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[A_SROA_4_0_EXTRACT_SHIFT]], 1
; SSE4-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[CONV_6]], 1
; SSE4-NEXT: [[ADD3_6:%.*]] = add nuw nsw i64 [[ADD_6]], [[CONV2_6]]
; SSE4-NEXT: [[ADD3_3:%.*]] = add nuw nsw i64 [[ADD_3]], [[B_SROA_4_0_EXTRACT_SHIFT]]
; SSE4-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_9_8_EXTRACT_SHIFT]], 1
; SSE4-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[CONV_2]], 1
; SSE4-NEXT: [[ADD3_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_9_8_EXTRACT_SHIFT]]
; SSE4-NEXT: [[ADD3_2:%.*]] = add nuw nsw i64 [[ADD_2]], [[CONV2_2]]
; SSE4-NEXT: [[TMP18:%.*]] = shl nuw i64 [[ADD3_7]], 47
; SSE4-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[ADD3_2]], 31
; SSE4-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = and i64 [[TMP18]], -281474976710656
; SSE4-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = and i64 [[TMP19]], 281470681743360
; SSE4-NEXT: [[TMP20:%.*]] = shl nuw nsw i64 [[ADD3_6]], 31
; SSE4-NEXT: [[TMP21:%.*]] = shl nuw i64 [[ADD3_3]], 47
; SSE4-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = and i64 [[TMP20]], 281470681743360
; SSE4-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = and i64 [[TMP21]], -281474976710656
; SSE4-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_9_8_INSERT_EXT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]]
; SSE4-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_4_0_INSERT_EXT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]]
; SSE4-NEXT: [[TMP22:%.*]] = shl nuw <2 x i32> [[TMP17]], splat (i32 15)
; SSE4-NEXT: [[TMP23:%.*]] = and <2 x i32> [[TMP22]], splat (i32 -65536)
; SSE4-NEXT: [[TMP24:%.*]] = zext <2 x i32> [[TMP23]] to <2 x i64>
; SSE4-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], i64 0
; SSE4-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> [[TMP25]], i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], i64 1
; SSE4-NEXT: [[TMP27:%.*]] = or disjoint <2 x i64> [[TMP26]], [[TMP24]]
; SSE4-NEXT: [[TMP28:%.*]] = or disjoint <2 x i64> [[TMP27]], [[TMP15]]
; SSE4-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i64 0
; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP29]], 0
; SSE4-NEXT: [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i64 1
; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP30]], 1
; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; AVX2-LABEL: @avgr_8_u16(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32
; AVX2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
; AVX2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
; AVX2-NEXT: [[TMP1:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32
; AVX2-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0
; AVX2-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[A_COERCE1]], i64 1
; AVX2-NEXT: [[TMP21:%.*]] = lshr <2 x i64> [[TMP20]], <i64 48, i64 32>
; AVX2-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[B_COERCE1:%.*]], i64 1
; AVX2-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32>
; AVX2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
; AVX2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
; AVX2-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[A_COERCE0]], i64 0
; AVX2-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535)
; AVX2-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP3]], splat (i64 65535)
; AVX2-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
; AVX2-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP1]], i64 1
; AVX2-NEXT: [[TMP11:%.*]] = lshr <2 x i32> [[TMP10]], splat (i32 16)
; AVX2-NEXT: [[TMP12:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 16)
; AVX2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
; AVX2-NEXT: [[CONV_2:%.*]] = and i64 [[A_SROA_3_0_EXTRACT_SHIFT]], 65535
; AVX2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
; AVX2-NEXT: [[CONV2_2:%.*]] = and i64 [[B_SROA_3_0_EXTRACT_SHIFT]], 65535
; AVX2-NEXT: [[TMP13:%.*]] = add nuw nsw <2 x i64> [[TMP7]], splat (i64 1)
; AVX2-NEXT: [[TMP14:%.*]] = add nuw nsw <2 x i64> [[TMP13]], [[TMP8]]
; AVX2-NEXT: [[TMP15:%.*]] = lshr <2 x i64> [[TMP14]], splat (i64 1)
; AVX2-NEXT: [[TMP16:%.*]] = add nuw nsw <2 x i32> [[TMP11]], splat (i32 1)
; AVX2-NEXT: [[TMP17:%.*]] = add nuw nsw <2 x i32> [[TMP16]], [[TMP12]]
; AVX2-NEXT: [[TMP34:%.*]] = and <2 x i64> [[TMP21]], <i64 -1, i64 65535>
; AVX2-NEXT: [[CONV2_6:%.*]] = and i64 [[B_SROA_8_8_EXTRACT_SHIFT]], 65535
; AVX2-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[A_SROA_4_0_EXTRACT_SHIFT]], 1
; AVX2-NEXT: [[TMP37:%.*]] = add nuw nsw <2 x i64> [[TMP34]], <i64 0, i64 1>
; AVX2-NEXT: [[TMP35:%.*]] = insertelement <2 x i64> poison, i64 [[ADD_3]], i64 0
; AVX2-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> [[TMP35]], i64 [[CONV2_6]], i64 1
; AVX2-NEXT: [[TMP38:%.*]] = add nuw nsw <2 x i64> [[TMP25]], [[TMP37]]
; AVX2-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_9_8_EXTRACT_SHIFT]], 1
; AVX2-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[CONV_2]], 1
; AVX2-NEXT: [[ADD3_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_9_8_EXTRACT_SHIFT]]
; AVX2-NEXT: [[ADD3_2:%.*]] = add nuw nsw i64 [[ADD_2]], [[CONV2_2]]
; AVX2-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> poison, i64 [[ADD3_2]], i64 0
; AVX2-NEXT: [[TMP40:%.*]] = insertelement <2 x i64> [[TMP39]], i64 [[ADD3_7]], i64 1
; AVX2-NEXT: [[TMP41:%.*]] = shl nuw <2 x i64> [[TMP40]], <i64 31, i64 47>
; AVX2-NEXT: [[TMP31:%.*]] = and <2 x i64> [[TMP41]], <i64 281470681743360, i64 -281474976710656>
; AVX2-NEXT: [[TMP32:%.*]] = shl nuw <2 x i64> [[TMP38]], <i64 47, i64 31>
; AVX2-NEXT: [[TMP33:%.*]] = and <2 x i64> [[TMP32]], <i64 -281474976710656, i64 281470681743360>
; AVX2-NEXT: [[TMP26:%.*]] = or disjoint <2 x i64> [[TMP31]], [[TMP33]]
; AVX2-NEXT: [[TMP22:%.*]] = shl nuw <2 x i32> [[TMP17]], splat (i32 15)
; AVX2-NEXT: [[TMP23:%.*]] = and <2 x i32> [[TMP22]], splat (i32 -65536)
; AVX2-NEXT: [[TMP24:%.*]] = zext <2 x i32> [[TMP23]] to <2 x i64>
; AVX2-NEXT: [[TMP27:%.*]] = or disjoint <2 x i64> [[TMP26]], [[TMP24]]
; AVX2-NEXT: [[TMP28:%.*]] = or disjoint <2 x i64> [[TMP27]], [[TMP15]]
; AVX2-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i64 0
; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP29]], 0
; AVX2-NEXT: [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i64 1
; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP30]], 1
; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; AVX512-LABEL: @avgr_8_u16(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32
; AVX512-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
; AVX512-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[B_COERCE1:%.*]], i64 1
; AVX512-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], <i64 48, i64 32>
; AVX512-NEXT: [[TMP4:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32
; AVX512-NEXT: [[TMP31:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0
; AVX512-NEXT: [[TMP32:%.*]] = insertelement <2 x i64> [[TMP31]], i64 [[A_COERCE1]], i64 1
; AVX512-NEXT: [[TMP49:%.*]] = lshr <2 x i64> [[TMP32]], <i64 48, i64 32>
; AVX512-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[B_COERCE0]], i64 0
; AVX512-NEXT: [[TMP6:%.*]] = trunc <2 x i64> [[TMP5]] to <2 x i32>
; AVX512-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[A_COERCE0]], i64 0
; AVX512-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], splat (i64 65535)
; AVX512-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP5]], splat (i64 65535)
; AVX512-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP7]], <i64 32, i64 0>
; AVX512-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
; AVX512-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP4]], i64 1
; AVX512-NEXT: [[TMP13:%.*]] = lshr <2 x i32> [[TMP12]], splat (i32 16)
; AVX512-NEXT: [[TMP14:%.*]] = lshr <2 x i64> [[TMP5]], <i64 32, i64 0>
; AVX512-NEXT: [[TMP15:%.*]] = lshr <2 x i32> [[TMP6]], splat (i32 16)
; AVX512-NEXT: [[TMP16:%.*]] = and <2 x i64> [[TMP10]], <i64 65535, i64 poison>
; AVX512-NEXT: [[TMP17:%.*]] = lshr <2 x i64> [[TMP10]], <i64 65535, i64 48>
; AVX512-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP17]], <2 x i32> <i32 0, i32 3>
; AVX512-NEXT: [[TMP19:%.*]] = and <2 x i64> [[TMP14]], <i64 65535, i64 poison>
; AVX512-NEXT: [[TMP20:%.*]] = lshr <2 x i64> [[TMP14]], <i64 65535, i64 48>
; AVX512-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP19]], <2 x i64> [[TMP20]], <2 x i32> <i32 0, i32 3>
; AVX512-NEXT: [[TMP22:%.*]] = add nuw nsw <2 x i64> [[TMP8]], splat (i64 1)
; AVX512-NEXT: [[TMP23:%.*]] = add nuw nsw <2 x i64> [[TMP22]], [[TMP9]]
; AVX512-NEXT: [[TMP24:%.*]] = lshr <2 x i64> [[TMP23]], splat (i64 1)
; AVX512-NEXT: [[TMP25:%.*]] = add nuw nsw <2 x i32> [[TMP13]], splat (i32 1)
; AVX512-NEXT: [[TMP26:%.*]] = add nuw nsw <2 x i32> [[TMP25]], [[TMP15]]
; AVX512-NEXT: [[TMP30:%.*]] = and <2 x i64> [[TMP49]], <i64 -1, i64 65535>
; AVX512-NEXT: [[TMP27:%.*]] = add nuw nsw <2 x i64> [[TMP3]], <i64 1, i64 poison>
; AVX512-NEXT: [[TMP28:%.*]] = and <2 x i64> [[TMP3]], <i64 poison, i64 65535>
; AVX512-NEXT: [[TMP29:%.*]] = shufflevector <2 x i64> [[TMP27]], <2 x i64> [[TMP28]], <2 x i32> <i32 0, i32 3>
; AVX512-NEXT: [[TMP33:%.*]] = add nuw nsw <2 x i64> [[TMP30]], <i64 0, i64 1>
; AVX512-NEXT: [[TMP34:%.*]] = add nuw nsw <2 x i64> [[TMP29]], [[TMP33]]
; AVX512-NEXT: [[TMP35:%.*]] = add nuw nsw <2 x i64> [[TMP18]], splat (i64 1)
; AVX512-NEXT: [[TMP36:%.*]] = add nuw nsw <2 x i64> [[TMP35]], [[TMP21]]
; AVX512-NEXT: [[TMP37:%.*]] = shl nuw <2 x i64> [[TMP36]], <i64 31, i64 47>
; AVX512-NEXT: [[TMP38:%.*]] = and <2 x i64> [[TMP37]], <i64 281470681743360, i64 -281474976710656>
; AVX512-NEXT: [[TMP39:%.*]] = shl nuw <2 x i64> [[TMP34]], <i64 47, i64 31>
; AVX512-NEXT: [[TMP40:%.*]] = and <2 x i64> [[TMP39]], <i64 -281474976710656, i64 281470681743360>
; AVX512-NEXT: [[TMP41:%.*]] = or disjoint <2 x i64> [[TMP38]], [[TMP40]]
; AVX512-NEXT: [[TMP42:%.*]] = shl nuw <2 x i32> [[TMP26]], splat (i32 15)
; AVX512-NEXT: [[TMP43:%.*]] = and <2 x i32> [[TMP42]], splat (i32 -65536)
; AVX512-NEXT: [[TMP44:%.*]] = zext <2 x i32> [[TMP43]] to <2 x i64>
; AVX512-NEXT: [[TMP45:%.*]] = or disjoint <2 x i64> [[TMP41]], [[TMP44]]
; AVX512-NEXT: [[TMP46:%.*]] = or disjoint <2 x i64> [[TMP45]], [[TMP24]]
; AVX512-NEXT: [[TMP47:%.*]] = extractelement <2 x i64> [[TMP46]], i64 0
; AVX512-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP47]], 0
; AVX512-NEXT: [[TMP48:%.*]] = extractelement <2 x i64> [[TMP46]], i64 1
; AVX512-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP48]], 1
; AVX512-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
entry:
%retval = alloca %"struct.std::array8", align 2
%a = alloca %"struct.std::array8", align 2
%b = alloca %"struct.std::array8", align 2
store i64 %a.coerce0, ptr %a, align 2
%0 = getelementptr inbounds nuw i8, ptr %a, i64 8
store i64 %a.coerce1, ptr %0, align 2
store i64 %b.coerce0, ptr %b, align 2
%1 = getelementptr inbounds nuw i8, ptr %b, i64 8
store i64 %b.coerce1, ptr %1, align 2
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp samesign ult i64 %i.0, 8
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
%.fca.0.load = load i64, ptr %retval, align 2
%.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0
%.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8
%.fca.1.load = load i64, ptr %.fca.1.gep, align 2
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1
ret { i64, i64 } %.fca.1.insert
for.body: ; preds = %for.cond
%arrayidx.i = getelementptr inbounds nuw [2 x i8], ptr %a, i64 %i.0
%2 = load i16, ptr %arrayidx.i, align 2
%conv = zext i16 %2 to i32
%arrayidx.i10 = getelementptr inbounds nuw [2 x i8], ptr %b, i64 %i.0
%3 = load i16, ptr %arrayidx.i10, align 2
%conv2 = zext i16 %3 to i32
%add = add nuw nsw i32 %conv, %conv2
%add3 = add nuw nsw i32 %add, 1
%shr = lshr i32 %add3, 1
%conv4 = trunc nuw i32 %shr to i16
%arrayidx.i11 = getelementptr inbounds nuw [2 x i8], ptr %retval, i64 %i.0
store i16 %conv4, ptr %arrayidx.i11, align 2
%inc = add nuw nsw i64 %i.0, 1
br label %for.cond
}
define { i64, i64 } @avgr_8_u16_alt(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
; Overflow-safe rounding-average idiom, per u16 lane:
;   (a >> 1) + (b >> 1) + ((a | b) & 1)  ==  (a + b + 1) >> 1
; computed without widening (see the reference IR loop body below).
; The autogenerated CHECK blocks pin the -O3 output for each target level
; (SROA'd scalar extracts on SSE2/SSE4, <4 x i16> vector form on AVX2,
; fully shift-vectorized form on AVX512); motivating pattern: PR128424.
; Do not hand-edit the CHECK lines — regenerate with update_test_checks.py.
; SSE2-LABEL: @avgr_8_u16_alt(
; SSE2-NEXT: entry:
; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 48
; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
; SSE2-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16
; SSE2-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i16
; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i16
; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 48
; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16
; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i16
; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i16
; SSE2-NEXT: [[SHR5:%.*]] = lshr i16 [[B_SROA_0_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_1:%.*]] = lshr i16 [[B_SROA_2_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_3:%.*]] = lshr i16 [[B_SROA_4_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_2:%.*]] = lshr i16 [[B_SROA_3_0_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW:%.*]] = add nuw i16 [[SHR5_3]], [[SHR5]]
; SSE2-NEXT: [[NARROW_1:%.*]] = add nuw i16 [[SHR5_2]], [[SHR5_1]]
; SSE2-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0]] to i16
; SSE2-NEXT: [[TMP14:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
; SSE2-NEXT: [[TMP17:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i16
; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> [[TMP14]], i16 [[TMP17]], i64 1
; SSE2-NEXT: [[TMP4:%.*]] = trunc i64 [[B_COERCE0]] to i16
; SSE2-NEXT: [[TMP5:%.*]] = insertelement <2 x i16> poison, i16 [[TMP4]], i64 0
; SSE2-NEXT: [[TMP6:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i16
; SSE2-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[TMP6]], i64 1
; SSE2-NEXT: [[TMP19:%.*]] = lshr <2 x i16> [[TMP18]], splat (i16 1)
; SSE2-NEXT: [[TMP21:%.*]] = lshr <2 x i16> [[TMP7]], splat (i16 1)
; SSE2-NEXT: [[TMP10:%.*]] = add nuw <2 x i16> [[TMP21]], [[TMP19]]
; SSE2-NEXT: [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> [[TMP37]], i16 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 2
; SSE2-NEXT: [[TMP39:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 3
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 2
; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 3
; SSE2-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP39]], [[TMP3]]
; SSE2-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP8]], splat (i16 1)
; SSE2-NEXT: [[TMP11:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x i16> [[TMP11]], i16 [[NARROW_1]], i64 2
; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x i16> [[TMP12]], i16 [[NARROW]], i64 3
; SSE2-NEXT: [[TMP15:%.*]] = add nuw <4 x i16> [[TMP13]], [[TMP9]]
; SSE2-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP15]] to i64
; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP16]], 0
; SSE2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 48
; SSE2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
; SSE2-NEXT: [[A_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16
; SSE2-NEXT: [[A_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_9_8_EXTRACT_SHIFT]] to i16
; SSE2-NEXT: [[A_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT]] to i16
; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE2:%.*]], 48
; SSE2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT1:%.*]] = lshr i64 [[B_COERCE2]], 32
; SSE2-NEXT: [[B_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE2]], 16
; SSE2-NEXT: [[B_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_9_8_EXTRACT_SHIFT]] to i16
; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT1]] to i16
; SSE2-NEXT: [[SHR_6:%.*]] = lshr i16 [[A_SROA_5_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR_7:%.*]] = lshr i16 [[A_SROA_7_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_6:%.*]] = lshr i16 [[B_SROA_8_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[SHR5_7:%.*]] = lshr i16 [[B_SROA_9_8_EXTRACT_TRUNC]], 1
; SSE2-NEXT: [[NARROW_6:%.*]] = add nuw i16 [[SHR5_6]], [[SHR_6]]
; SSE2-NEXT: [[NARROW_7:%.*]] = add nuw i16 [[SHR5_7]], [[SHR_7]]
; SSE2-NEXT: [[TMP40:%.*]] = trunc i64 [[B_COERCE1]] to i16
; SSE2-NEXT: [[TMP41:%.*]] = insertelement <2 x i16> poison, i16 [[TMP40]], i64 0
; SSE2-NEXT: [[TMP42:%.*]] = trunc i64 [[A_SROA_7_8_EXTRACT_SHIFT]] to i16
; SSE2-NEXT: [[TMP27:%.*]] = insertelement <2 x i16> [[TMP41]], i16 [[TMP42]], i64 1
; SSE2-NEXT: [[TMP28:%.*]] = trunc i64 [[B_COERCE2]] to i16
; SSE2-NEXT: [[TMP29:%.*]] = insertelement <2 x i16> poison, i16 [[TMP28]], i64 0
; SSE2-NEXT: [[TMP45:%.*]] = trunc i64 [[B_SROA_7_8_EXTRACT_SHIFT]] to i16
; SSE2-NEXT: [[TMP46:%.*]] = insertelement <2 x i16> [[TMP29]], i16 [[TMP45]], i64 1
; SSE2-NEXT: [[TMP32:%.*]] = lshr <2 x i16> [[TMP27]], splat (i16 1)
; SSE2-NEXT: [[TMP47:%.*]] = lshr <2 x i16> [[TMP46]], splat (i16 1)
; SSE2-NEXT: [[TMP34:%.*]] = add nuw <2 x i16> [[TMP47]], [[TMP32]]
; SSE2-NEXT: [[TMP35:%.*]] = shufflevector <2 x i16> [[TMP46]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP36:%.*]] = insertelement <4 x i16> [[TMP35]], i16 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 2
; SSE2-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> [[TMP36]], i16 [[B_SROA_8_8_EXTRACT_TRUNC]], i64 3
; SSE2-NEXT: [[TMP22:%.*]] = shufflevector <2 x i16> [[TMP27]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[A_SROA_7_8_EXTRACT_TRUNC]], i64 2
; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[A_SROA_5_8_EXTRACT_TRUNC]], i64 3
; SSE2-NEXT: [[TMP25:%.*]] = or <4 x i16> [[TMP20]], [[TMP24]]
; SSE2-NEXT: [[TMP26:%.*]] = and <4 x i16> [[TMP25]], splat (i16 1)
; SSE2-NEXT: [[TMP43:%.*]] = shufflevector <2 x i16> [[TMP34]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP43]], i16 [[NARROW_7]], i64 2
; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[NARROW_6]], i64 3
; SSE2-NEXT: [[TMP31:%.*]] = add nuw <4 x i16> [[TMP30]], [[TMP26]]
; SSE2-NEXT: [[TMP33:%.*]] = bitcast <4 x i16> [[TMP31]] to i64
; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP33]], 1
; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; SSE4-LABEL: @avgr_8_u16_alt(
; SSE4-NEXT: entry:
; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 48
; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
; SSE4-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16
; SSE4-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i16
; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i16
; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 48
; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16
; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i16
; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i16
; SSE4-NEXT: [[SHR5:%.*]] = lshr i16 [[B_SROA_0_0_EXTRACT_TRUNC]], 1
; SSE4-NEXT: [[SHR5_1:%.*]] = lshr i16 [[B_SROA_2_0_EXTRACT_TRUNC]], 1
; SSE4-NEXT: [[SHR5_3:%.*]] = lshr i16 [[B_SROA_4_0_EXTRACT_TRUNC]], 1
; SSE4-NEXT: [[SHR5_2:%.*]] = lshr i16 [[B_SROA_3_0_EXTRACT_TRUNC]], 1
; SSE4-NEXT: [[NARROW:%.*]] = add nuw i16 [[SHR5_3]], [[SHR5]]
; SSE4-NEXT: [[NARROW_1:%.*]] = add nuw i16 [[SHR5_2]], [[SHR5_1]]
; SSE4-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0]] to i16
; SSE4-NEXT: [[TMP14:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
; SSE4-NEXT: [[TMP17:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i16
; SSE4-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> [[TMP14]], i16 [[TMP17]], i64 1
; SSE4-NEXT: [[TMP4:%.*]] = trunc i64 [[B_COERCE0]] to i16
; SSE4-NEXT: [[TMP5:%.*]] = insertelement <2 x i16> poison, i16 [[TMP4]], i64 0
; SSE4-NEXT: [[TMP6:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i16
; SSE4-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[TMP6]], i64 1
; SSE4-NEXT: [[TMP19:%.*]] = lshr <2 x i16> [[TMP18]], splat (i16 1)
; SSE4-NEXT: [[TMP21:%.*]] = lshr <2 x i16> [[TMP7]], splat (i16 1)
; SSE4-NEXT: [[TMP10:%.*]] = add nuw <2 x i16> [[TMP21]], [[TMP19]]
; SSE4-NEXT: [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> [[TMP37]], i16 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 2
; SSE4-NEXT: [[TMP39:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 3
; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 2
; SSE4-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 3
; SSE4-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP39]], [[TMP3]]
; SSE4-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP8]], splat (i16 1)
; SSE4-NEXT: [[TMP11:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP12:%.*]] = insertelement <4 x i16> [[TMP11]], i16 [[NARROW_1]], i64 2
; SSE4-NEXT: [[TMP13:%.*]] = insertelement <4 x i16> [[TMP12]], i16 [[NARROW]], i64 3
; SSE4-NEXT: [[TMP15:%.*]] = add nuw <4 x i16> [[TMP13]], [[TMP9]]
; SSE4-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP15]] to i64
; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP16]], 0
; SSE4-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 48
; SSE4-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
; SSE4-NEXT: [[A_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16
; SSE4-NEXT: [[A_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_9_8_EXTRACT_SHIFT]] to i16
; SSE4-NEXT: [[A_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT]] to i16
; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE2:%.*]], 48
; SSE4-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT1:%.*]] = lshr i64 [[B_COERCE2]], 32
; SSE4-NEXT: [[B_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE2]], 16
; SSE4-NEXT: [[B_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_9_8_EXTRACT_SHIFT]] to i16
; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT1]] to i16
; SSE4-NEXT: [[SHR_6:%.*]] = lshr i16 [[A_SROA_5_8_EXTRACT_TRUNC]], 1
; SSE4-NEXT: [[SHR_7:%.*]] = lshr i16 [[A_SROA_7_8_EXTRACT_TRUNC]], 1
; SSE4-NEXT: [[SHR5_6:%.*]] = lshr i16 [[B_SROA_8_8_EXTRACT_TRUNC]], 1
; SSE4-NEXT: [[SHR5_7:%.*]] = lshr i16 [[B_SROA_9_8_EXTRACT_TRUNC]], 1
; SSE4-NEXT: [[NARROW_6:%.*]] = add nuw i16 [[SHR5_6]], [[SHR_6]]
; SSE4-NEXT: [[NARROW_7:%.*]] = add nuw i16 [[SHR5_7]], [[SHR_7]]
; SSE4-NEXT: [[TMP40:%.*]] = trunc i64 [[B_COERCE1]] to i16
; SSE4-NEXT: [[TMP41:%.*]] = insertelement <2 x i16> poison, i16 [[TMP40]], i64 0
; SSE4-NEXT: [[TMP42:%.*]] = trunc i64 [[A_SROA_7_8_EXTRACT_SHIFT]] to i16
; SSE4-NEXT: [[TMP27:%.*]] = insertelement <2 x i16> [[TMP41]], i16 [[TMP42]], i64 1
; SSE4-NEXT: [[TMP28:%.*]] = trunc i64 [[B_COERCE2]] to i16
; SSE4-NEXT: [[TMP29:%.*]] = insertelement <2 x i16> poison, i16 [[TMP28]], i64 0
; SSE4-NEXT: [[TMP45:%.*]] = trunc i64 [[B_SROA_7_8_EXTRACT_SHIFT]] to i16
; SSE4-NEXT: [[TMP46:%.*]] = insertelement <2 x i16> [[TMP29]], i16 [[TMP45]], i64 1
; SSE4-NEXT: [[TMP32:%.*]] = lshr <2 x i16> [[TMP27]], splat (i16 1)
; SSE4-NEXT: [[TMP47:%.*]] = lshr <2 x i16> [[TMP46]], splat (i16 1)
; SSE4-NEXT: [[TMP34:%.*]] = add nuw <2 x i16> [[TMP47]], [[TMP32]]
; SSE4-NEXT: [[TMP35:%.*]] = shufflevector <2 x i16> [[TMP46]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP36:%.*]] = insertelement <4 x i16> [[TMP35]], i16 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 2
; SSE4-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> [[TMP36]], i16 [[B_SROA_8_8_EXTRACT_TRUNC]], i64 3
; SSE4-NEXT: [[TMP22:%.*]] = shufflevector <2 x i16> [[TMP27]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[A_SROA_7_8_EXTRACT_TRUNC]], i64 2
; SSE4-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[A_SROA_5_8_EXTRACT_TRUNC]], i64 3
; SSE4-NEXT: [[TMP25:%.*]] = or <4 x i16> [[TMP20]], [[TMP24]]
; SSE4-NEXT: [[TMP26:%.*]] = and <4 x i16> [[TMP25]], splat (i16 1)
; SSE4-NEXT: [[TMP43:%.*]] = shufflevector <2 x i16> [[TMP34]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP43]], i16 [[NARROW_7]], i64 2
; SSE4-NEXT: [[TMP30:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[NARROW_6]], i64 3
; SSE4-NEXT: [[TMP31:%.*]] = add nuw <4 x i16> [[TMP30]], [[TMP26]]
; SSE4-NEXT: [[TMP33:%.*]] = bitcast <4 x i16> [[TMP31]] to i64
; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP33]], 1
; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; AVX2-LABEL: @avgr_8_u16_alt(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 48
; AVX2-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
; AVX2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16
; AVX2-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[A_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE0]] to i16
; AVX2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 48
; AVX2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
; AVX2-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16
; AVX2-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE0]] to i16
; AVX2-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A_SROA_4_0_EXTRACT_TRUNC]], i64 0
; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[A_SROA_3_0_EXTRACT_TRUNC]], i64 1
; AVX2-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[A_SROA_2_0_EXTRACT_TRUNC]], i64 2
; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[A_SROA_0_0_EXTRACT_TRUNC]], i64 3
; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1)
; AVX2-NEXT: [[TMP5:%.*]] = insertelement <4 x i16> poison, i16 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 0
; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i16> [[TMP5]], i16 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 1
; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 2
; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i16> [[TMP7]], i16 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 3
; AVX2-NEXT: [[TMP9:%.*]] = lshr <4 x i16> [[TMP8]], splat (i16 1)
; AVX2-NEXT: [[TMP10:%.*]] = add nuw <4 x i16> [[TMP9]], [[TMP4]]
; AVX2-NEXT: [[TMP11:%.*]] = or <4 x i16> [[TMP8]], [[TMP3]]
; AVX2-NEXT: [[TMP12:%.*]] = and <4 x i16> [[TMP11]], splat (i16 1)
; AVX2-NEXT: [[TMP13:%.*]] = add nuw <4 x i16> [[TMP10]], [[TMP12]]
; AVX2-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP15]], 0
; AVX2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1:%.*]], 48
; AVX2-NEXT: [[A_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
; AVX2-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 16
; AVX2-NEXT: [[A_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_9_8_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[A_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_7_8_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[A_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_8_8_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[A_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE1]] to i16
; AVX2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 48
; AVX2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
; AVX2-NEXT: [[B_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16
; AVX2-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_9_8_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[B_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[B_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_7_8_EXTRACT_SHIFT]] to i16
; AVX2-NEXT: [[B_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE1]] to i16
; AVX2-NEXT: [[TMP16:%.*]] = insertelement <4 x i16> poison, i16 [[A_SROA_9_8_EXTRACT_TRUNC]], i64 0
; AVX2-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[A_SROA_8_8_EXTRACT_TRUNC]], i64 1
; AVX2-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[A_SROA_7_8_EXTRACT_TRUNC]], i64 2
; AVX2-NEXT: [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[A_SROA_5_8_EXTRACT_TRUNC]], i64 3
; AVX2-NEXT: [[TMP20:%.*]] = lshr <4 x i16> [[TMP19]], splat (i16 1)
; AVX2-NEXT: [[TMP21:%.*]] = insertelement <4 x i16> poison, i16 [[B_SROA_5_8_EXTRACT_TRUNC]], i64 0
; AVX2-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> [[TMP21]], i16 [[B_SROA_8_8_EXTRACT_TRUNC]], i64 1
; AVX2-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[B_SROA_7_8_EXTRACT_TRUNC]], i64 2
; AVX2-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 3
; AVX2-NEXT: [[TMP25:%.*]] = lshr <4 x i16> [[TMP24]], splat (i16 1)
; AVX2-NEXT: [[TMP26:%.*]] = add nuw <4 x i16> [[TMP25]], [[TMP20]]
; AVX2-NEXT: [[TMP27:%.*]] = or <4 x i16> [[TMP24]], [[TMP19]]
; AVX2-NEXT: [[TMP28:%.*]] = and <4 x i16> [[TMP27]], splat (i16 1)
; AVX2-NEXT: [[TMP29:%.*]] = add nuw <4 x i16> [[TMP26]], [[TMP28]]
; AVX2-NEXT: [[TMP31:%.*]] = bitcast <4 x i16> [[TMP29]] to i64
; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP31]], 1
; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; AVX512-LABEL: @avgr_8_u16_alt(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[A_COERCE0:%.*]], i64 0
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer
; AVX512-NEXT: [[TMP2:%.*]] = lshr <4 x i64> [[TMP1]], <i64 0, i64 16, i64 32, i64 48>
; AVX512-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i16>
; AVX512-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0
; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> zeroinitializer
; AVX512-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP5]], <i64 0, i64 16, i64 32, i64 48>
; AVX512-NEXT: [[TMP7:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i16>
; AVX512-NEXT: [[TMP8:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1)
; AVX512-NEXT: [[TMP9:%.*]] = lshr <4 x i16> [[TMP7]], splat (i16 1)
; AVX512-NEXT: [[TMP10:%.*]] = add nuw <4 x i16> [[TMP9]], [[TMP8]]
; AVX512-NEXT: [[TMP11:%.*]] = or <4 x i16> [[TMP7]], [[TMP3]]
; AVX512-NEXT: [[TMP12:%.*]] = and <4 x i16> [[TMP11]], splat (i16 1)
; AVX512-NEXT: [[TMP13:%.*]] = add nuw <4 x i16> [[TMP10]], [[TMP12]]
; AVX512-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
; AVX512-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP15]], 0
; AVX512-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[A_COERCE1:%.*]], i64 0
; AVX512-NEXT: [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP16]], <4 x i64> poison, <4 x i32> zeroinitializer
; AVX512-NEXT: [[TMP18:%.*]] = lshr <4 x i64> [[TMP17]], <i64 0, i64 16, i64 32, i64 48>
; AVX512-NEXT: [[TMP19:%.*]] = trunc <4 x i64> [[TMP18]] to <4 x i16>
; AVX512-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[B_COERCE1:%.*]], i64 0
; AVX512-NEXT: [[TMP21:%.*]] = shufflevector <4 x i64> [[TMP20]], <4 x i64> poison, <4 x i32> zeroinitializer
; AVX512-NEXT: [[TMP22:%.*]] = lshr <4 x i64> [[TMP21]], <i64 0, i64 16, i64 32, i64 48>
; AVX512-NEXT: [[TMP23:%.*]] = trunc <4 x i64> [[TMP22]] to <4 x i16>
; AVX512-NEXT: [[TMP24:%.*]] = lshr <4 x i16> [[TMP19]], splat (i16 1)
; AVX512-NEXT: [[TMP25:%.*]] = lshr <4 x i16> [[TMP23]], splat (i16 1)
; AVX512-NEXT: [[TMP26:%.*]] = add nuw <4 x i16> [[TMP25]], [[TMP24]]
; AVX512-NEXT: [[TMP27:%.*]] = or <4 x i16> [[TMP23]], [[TMP19]]
; AVX512-NEXT: [[TMP28:%.*]] = and <4 x i16> [[TMP27]], splat (i16 1)
; AVX512-NEXT: [[TMP29:%.*]] = add nuw <4 x i16> [[TMP26]], [[TMP28]]
; AVX512-NEXT: [[TMP31:%.*]] = bitcast <4 x i16> [[TMP29]] to i64
; AVX512-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP31]], 1
; AVX512-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
;
; Reference IR: spill the coerced i64 argument halves into byte-addressable
; allocas, then loop over the 8 u16 lanes writing each rounding average
; into %retval, which is repacked into the { i64, i64 } return value.
entry:
%retval = alloca %"struct.std::array8", align 2
%a = alloca %"struct.std::array8", align 2
%b = alloca %"struct.std::array8", align 2
store i64 %a.coerce0, ptr %a, align 2
%0 = getelementptr inbounds nuw i8, ptr %a, i64 8
store i64 %a.coerce1, ptr %0, align 2
store i64 %b.coerce0, ptr %b, align 2
%1 = getelementptr inbounds nuw i8, ptr %b, i64 8
store i64 %b.coerce1, ptr %1, align 2
br label %for.cond
; Loop header: i counts 0..7 (one iteration per i16 element).
for.cond: ; preds = %for.body, %entry
%i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp samesign ult i64 %i.0, 8
br i1 %cmp, label %for.body, label %for.cond.cleanup
; Reload the two result halves from %retval and build the aggregate return.
for.cond.cleanup: ; preds = %for.cond
%.fca.0.load = load i64, ptr %retval, align 2
%.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0
%.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8
%.fca.1.load = load i64, ptr %.fca.1.gep, align 2
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1
ret { i64, i64 } %.fca.1.insert
; Per-lane body; the [2 x i8] GEP type gives a 2-byte stride for the i16 loads.
for.body: ; preds = %for.cond
%arrayidx.i = getelementptr inbounds nuw [2 x i8], ptr %a, i64 %i.0
%2 = load i16, ptr %arrayidx.i, align 2
%arrayidx.i22 = getelementptr inbounds nuw [2 x i8], ptr %b, i64 %i.0
%3 = load i16, ptr %arrayidx.i22, align 2
; retval[i] = (a[i] >> 1) + (b[i] >> 1) + ((a[i] | b[i]) & 1)
; The (a|b)&1 term restores the rounding bit both shifts dropped,
; so this equals (a[i] + b[i] + 1) >> 1 with no i16 overflow.
%shr = lshr i16 %2, 1
%shr5 = lshr i16 %3, 1
%narrow = add nuw i16 %shr, %shr5
%or21 = or i16 %2, %3
%4 = and i16 %or21, 1
%add12 = add i16 %narrow, %4
%arrayidx.i23 = getelementptr inbounds nuw [2 x i8], ptr %retval, i64 %i.0
store i16 %add12, ptr %arrayidx.i23, align 2
%inc = add nuw nsw i64 %i.0, 1
br label %for.cond
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}