 ; llvm/test/Transforms/AggressiveInstCombine/popcount.ll - llvm-project

 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s

 ; 8-bit scalar popcount idiom. C-level form of the idiom:
 ;int popcount8(unsigned char i) {
 ;  i = i - ((i >> 1) & 0x55);
 ;  i = (i & 0x33) + ((i >> 2) & 0x33);
 ;  i = ((i + (i >> 4)) & 0x0F);
 ; return (i * 0x01010101);
 ;}
 ; The IR below ends at the 0x0F mask followed by a zext (there is no mul/lshr
 ; tail), and the assertions expect the sequence to come out unchanged: no
 ; llvm.ctpop call is formed for this function.
 define signext i32 @popcount8(i8 zeroext %0) {
 ; CHECK-LABEL: @popcount8(
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP0:%.*]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP2]], 85
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i8 [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = and i8 [[TMP4]], 51
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr i8 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = and i8 [[TMP6]], 51
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i8 [[TMP7]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr i8 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i8 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = and i8 [[TMP10]], 15
 ; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP11]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP12]]
 ;
   %2 = lshr i8 %0, 1
   %3 = and i8 %2, 85               ; 0x55
   %4 = sub i8 %0, %3               ; sums of each 2-bit field
   %5 = and i8 %4, 51               ; 0x33
   %6 = lshr i8 %4, 2
   %7 = and i8 %6, 51               ; 0x33
   %8 = add nuw nsw i8 %7, %5       ; sums of each 4-bit field
   %9 = lshr i8 %8, 4
   %10 = add nuw nsw i8 %9, %8
   %11 = and i8 %10, 15             ; 0x0F
   %12 = zext i8 %11 to i32
   ret i32 %12
 }

 ; 32-bit scalar popcount idiom with a multiply/shift tail. The assertions
 ; expect the whole sequence to be replaced by a single llvm.ctpop.i32 call.
 ;int popcount32(unsigned i) {
 ;  i = i - ((i >> 1) & 0x55555555);
 ;  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
 ;  i = ((i + (i >> 4)) & 0x0F0F0F0F);
 ; return (i * 0x01010101) >> 24;
 ;}
 define signext i32 @popcount32(i32 zeroext %0) {
 ; CHECK-LABEL: @popcount32(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP0:%.*]])
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
   %2 = lshr i32 %0, 1
   %3 = and i32 %2, 1431655765      ; 0x55555555
   %4 = sub i32 %0, %3              ; sums of each 2-bit field
   %5 = and i32 %4, 858993459       ; 0x33333333
   %6 = lshr i32 %4, 2
   %7 = and i32 %6, 858993459       ; 0x33333333
   %8 = add nuw nsw i32 %7, %5      ; sums of each 4-bit field
   %9 = lshr i32 %8, 4
   %10 = add nuw nsw i32 %9, %8
   %11 = and i32 %10, 252645135     ; 0x0F0F0F0F: per-byte sums
   %12 = mul i32 %11, 16843009      ; 0x01010101: adds all byte sums into the top byte
   %13 = lshr i32 %12, 24           ; move the top byte down
   ret i32 %13
 }

 ; 64-bit scalar popcount idiom with a multiply/shift tail. The assertions
 ; expect a single llvm.ctpop.i64 call followed by the original trunc to i32.
 ;int popcount64(unsigned long long i) {
 ;  i = i - ((i >> 1) & 0x5555555555555555);
 ;  i = (i & 0x3333333333333333) + ((i >> 2) & 0x3333333333333333);
 ;  i = ((i + (i >> 4)) & 0x0F0F0F0F0F0F0F0F);
 ; return (i * 0x0101010101010101) >> 56;
 ;}
 define signext i32 @popcount64(i64 %0) {
 ; CHECK-LABEL: @popcount64(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0:%.*]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = lshr i64 %0, 1
   %3 = and i64 %2, 6148914691236517205     ; 0x5555555555555555
   %4 = sub i64 %0, %3                      ; sums of each 2-bit field
   %5 = and i64 %4, 3689348814741910323     ; 0x3333333333333333
   %6 = lshr i64 %4, 2
   %7 = and i64 %6, 3689348814741910323     ; 0x3333333333333333
   %8 = add nuw nsw i64 %7, %5              ; sums of each 4-bit field
   %9 = lshr i64 %8, 4
   %10 = add nuw nsw i64 %9, %8
   %11 = and i64 %10, 1085102592571150095   ; 0x0F0F0F0F0F0F0F0F: per-byte sums
   %12 = mul i64 %11, 72340172838076673     ; 0x0101010101010101: adds all byte sums into the top byte
   %13 = lshr i64 %12, 56                   ; move the top byte down
   %14 = trunc i64 %13 to i32
   ret i32 %14
 }

 ; 128-bit scalar popcount idiom with a multiply/shift tail. The C form builds
 ; each 128-bit mask from two identical 64-bit halves. The assertions expect a
 ; single llvm.ctpop.i128 call followed by the original trunc to i32.
 ;int popcount128(__uint128_t i) {
 ;  __uint128_t x = 0x5555555555555555;
 ;  x <<= 64;
 ;  x |= 0x5555555555555555;
 ;  __uint128_t y = 0x3333333333333333;
 ;  y <<= 64;
 ;  y |= 0x3333333333333333;
 ;  __uint128_t z = 0x0f0f0f0f0f0f0f0f;
 ;  z <<= 64;
 ;  z |= 0x0f0f0f0f0f0f0f0f;
 ;  __uint128_t a = 0x0101010101010101;
 ;  a <<= 64;
 ;  a |= 0x0101010101010101;
 ;  unsigned mask = 120;
 ;  i = i - ((i >> 1) & x);
 ;  i = (i & y) + ((i >> 2) & y);
 ;  i = ((i + (i >> 4)) & z);
 ;  return (i * a) >> mask;
 ;}
 define signext i32 @popcount128(i128 %0) {
 ; CHECK-LABEL: @popcount128(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i128 @llvm.ctpop.i128(i128 [[TMP0:%.*]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc i128 [[TMP2]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = lshr i128 %0, 1
   ; x in the C form: the 0x55 byte pattern repeated across 128 bits.
   %3 = and i128 %2, 113427455640312821154458202477256070485
   %4 = sub i128 %0, %3
   ; y in the C form: the 0x33 byte pattern repeated across 128 bits.
   %5 = and i128 %4, 68056473384187692692674921486353642291
   %6 = lshr i128 %4, 2
   %7 = and i128 %6, 68056473384187692692674921486353642291
   %8 = add nuw nsw i128 %7, %5
   %9 = lshr i128 %8, 4
   %10 = add nuw nsw i128 %9, %8
   ; z in the C form: the 0x0f byte pattern repeated across 128 bits.
   %11 = and i128 %10, 20016609818878733144904388672456953615
   ; a in the C form: the 0x01 byte pattern repeated across 128 bits.
   %12 = mul i128 %11, 1334440654591915542993625911497130241
   %13 = lshr i128 %12, 120         ; move the top byte down
   %14 = trunc i128 %13 to i32
   ret i32 %14
 }

 ; <16 x i8> vector form of the 8-bit popcount idiom. As in the scalar i8 test,
 ; the IR stops at the 0x0F mask (no mul/lshr tail) and the assertions expect
 ; the instruction sequence to come out unchanged, with the constant vectors
 ; printed in splat form.
 ;vector unsigned char popcount8vec(vector unsigned char i)
 ;{
 ;  i = i - ((i>> 1) & 0x55);
 ;  i = (i & 0x33) + ((i >> 2) & 0x33);
 ;  i = ((i + (i >> 4)) & 0x0F);
 ;  return (i * 0x01);
 ;}
 define <16 x i8> @popcount8vec(<16 x i8> %0) {
 ; CHECK-LABEL: @popcount8vec(
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr <16 x i8> [[TMP0:%.*]], splat (i8 1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i8> [[TMP2]], splat (i8 85)
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <16 x i8> [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = and <16 x i8> [[TMP4]], splat (i8 51)
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr <16 x i8> [[TMP4]], splat (i8 2)
 ; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i8> [[TMP6]], splat (i8 51)
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <16 x i8> [[TMP7]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr <16 x i8> [[TMP8]], splat (i8 4)
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw <16 x i8> [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i8> [[TMP10]], splat (i8 15)
 ; CHECK-NEXT:    ret <16 x i8> [[TMP11]]
 ;
   %2 = lshr <16 x i8> %0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   ; 85 = 0x55 in every lane.
   %3 = and <16 x i8> %2, <i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85>
   %4 = sub <16 x i8> %0, %3
   ; 51 = 0x33 in every lane.
   %5 = and <16 x i8> %4, <i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51>
   %6 = lshr <16 x i8> %4, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
   %7 = and <16 x i8> %6, <i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51>
   %8 = add nuw nsw <16 x i8> %7, %5
   %9 = lshr <16 x i8> %8, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   %10 = add nuw nsw <16 x i8> %9, %8
   ; 15 = 0x0F in every lane.
   %11 = and <16 x i8> %10, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
   ret <16 x i8> %11
 }

 ; <4 x i32> vector form of the 32-bit popcount idiom with a multiply/shift
 ; tail. The assertions expect a single llvm.ctpop.v4i32 call.
 ;vector unsigned int popcount32vec(vector unsigned int i)
 ;{
 ;  i = i - ((i>> 1) & 0x55555555);
 ;  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
 ;  i = ((i + (i >> 4)) & 0x0F0F0F0F);
 ;  return (i * 0x01010101) >> 24;
 ;}
 define <4 x i32> @popcount32vec(<4 x i32> %0) {
 ; CHECK-LABEL: @popcount32vec(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP0:%.*]])
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %2 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
   ; 1431655765 = 0x55555555
   %3 = and <4 x i32> %2, <i32 1431655765, i32 1431655765, i32 1431655765, i32 1431655765>
   %4 = sub <4 x i32> %0, %3
   ; 858993459 = 0x33333333
   %5 = and <4 x i32> %4, <i32 858993459, i32 858993459, i32 858993459, i32 858993459>
   %6 = lshr <4 x i32> %4, <i32 2, i32 2, i32 2, i32 2>
   %7 = and <4 x i32> %6, <i32 858993459, i32 858993459, i32 858993459, i32 858993459>
   %8 = add nuw nsw <4 x i32> %7, %5
   %9 = lshr <4 x i32> %8, <i32 4, i32 4, i32 4, i32 4>
   %10 = add nuw nsw <4 x i32> %9, %8
   ; 252645135 = 0x0F0F0F0F
   %11 = and <4 x i32> %10, <i32 252645135, i32 252645135, i32 252645135, i32 252645135>
   ; 16843009 = 0x01010101: adds all byte sums into the top byte of each lane.
   %12 = mul <4 x i32> %11, <i32 16843009, i32 16843009, i32 16843009, i32 16843009>
   %13 = lshr <4 x i32> %12, <i32 24, i32 24, i32 24, i32 24>
   ret <4 x i32> %13
 }

 ; 64-bit popcount idiom applied to a zero-extended i32. The first mask is the
 ; 32-bit pattern 0x55555555 (1431655765) rather than the full 64-bit
 ; 0x5555555555555555 used in @popcount64; the remaining constants are the
 ; usual 64-bit ones. The assertions still expect llvm.ctpop.i64 on the zext.
 ; NOTE(review): presumably the narrower mask is accepted because the upper
 ; 32 bits of %zext are known to be zero -- confirm against the pass.
 define i32 @popcount64_zext(i32 %x) {
 ; CHECK-LABEL: @popcount64_zext(
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[X:%.*]] to i64
 ; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.ctpop.i64(i64 [[ZEXT]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = trunc nuw nsw i64 [[TMP12]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP13]]
 ;
   %zext = zext i32 %x to i64
   %1 = lshr i64 %zext, 1
   %2 = and i64 %1, 1431655765              ; 0x55555555 (only the low 32 bits)
   %3 = sub nsw i64 %zext, %2
   %4 = and i64 %3, 3689348814741910323     ; 0x3333333333333333
   %5 = lshr i64 %3, 2
   %6 = and i64 %5, 3689348814741910323     ; 0x3333333333333333
   %7 = add nuw nsw i64 %6, %4
   %8 = lshr i64 %7, 4
   %9 = add nuw nsw i64 %8, %7
   %10 = and i64 %9, 1085102592571150095    ; 0x0F0F0F0F0F0F0F0F
   %11 = mul i64 %10, 72340172838076673     ; 0x0101010101010101
   %12 = lshr i64 %11, 56
   %13 = trunc nuw nsw i64 %12 to i32
   ret i32 %13
 }

 ; 64-bit popcount idiom applied to an input that was first masked with
 ; 0xffff0000ffff0000. The first mask of the idiom is correspondingly reduced
 ; to 0x5555000055550000 instead of the full 0x5555555555555555; the remaining
 ; constants are the usual 64-bit ones. The assertions still expect
 ; llvm.ctpop.i64 on the masked value.
 ; NOTE(review): presumably the reduced mask is accepted because the cleared
 ; bits of %mask are known to be zero -- confirm against the pass.
 define i32 @popcount64_mask(i64 %x) {
 ; CHECK-LABEL: @popcount64_mask(
 ; CHECK-NEXT:    [[MASK:%.*]] = and i64 [[X:%.*]], -281470681808896
 ; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.ctpop.i64(i64 [[MASK]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = trunc nuw nsw i64 [[TMP12]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP13]]
 ;
   %mask = and i64 %x, -281470681808896 ; 0xffff0000ffff0000
   %1 = lshr i64 %mask, 1
   %2 = and i64 %1, 6148820867675914240 ; 0x5555000055550000
   %3 = sub nsw i64 %mask, %2
   %4 = and i64 %3, 3689348814741910323     ; 0x3333333333333333
   %5 = lshr i64 %3, 2
   %6 = and i64 %5, 3689348814741910323     ; 0x3333333333333333
   %7 = add nuw nsw i64 %6, %4
   %8 = lshr i64 %7, 4
   %9 = add nuw nsw i64 %8, %7
   %10 = and i64 %9, 1085102592571150095    ; 0x0F0F0F0F0F0F0F0F
   %11 = mul i64 %10, 72340172838076673     ; 0x0101010101010101
   %12 = lshr i64 %11, 56
   %13 = trunc nuw nsw i64 %12 to i32
   ret i32 %13
 }

 ; 32-bit scalar popcount, variant 2: same mask/add reduction as @popcount32 up
 ; to the per-byte sums, but the final byte sums are combined with shift+add
 ; steps and a 63 mask instead of a multiply and shift. The assertions expect a
 ; single llvm.ctpop.i32 call.
 define i32 @popcnt2_32(i32 noundef %0)  {
 ; CHECK-LABEL: define i32 @popcnt2_32(
 ; CHECK-SAME: i32 noundef [[TMP0:%.*]])  {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP0]])
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
   %2 = lshr i32 %0, 1
   %3 = and i32 %2, 1431655765      ; 0x55555555
   %4 = sub i32 %0, %3              ; sums of each 2-bit field
   %5 = and i32 %4, 858993459       ; 0x33333333
   %6 = lshr i32 %4, 2
   %7 = and i32 %6, 858993459       ; 0x33333333
   %8 = add nuw nsw i32 %7, %5      ; sums of each 4-bit field
   %9 = lshr i32 %8, 4
   %10 = add nuw nsw i32 %9, %8
   %11 = and i32 %10, 252645135     ; 0x0F0F0F0F: per-byte sums
   %12 = lshr i32 %11, 8
   %13 = add nuw nsw i32 %12, %11   ; combine adjacent byte sums
   %14 = lshr i32 %13, 16
   %15 = add nuw nsw i32 %14, %13   ; combine the two 16-bit halves
   %16 = and i32 %15, 63            ; 2*32-1: keep only the low bits holding the total
   ret i32 %16
 }

 ; 32-bit scalar popcount, variant 3: the 4-bit-field step is written as
 ; x + (-3 * ((x >> 2) & 0x33333333)), which is arithmetically the same as
 ; (x & 0x33333333) + ((x >> 2) & 0x33333333). The tail uses shift+add steps
 ; and a 63 mask. The assertions expect a single llvm.ctpop.i32 call.
 define i32 @popcnt3_32(i32 noundef %0) {
 ; CHECK-LABEL: define i32 @popcnt3_32(
 ; CHECK-SAME: i32 noundef [[TMP0:%.*]]) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP0]])
 ; CHECK-NEXT:    ret i32 [[TMP16]]
 ;
   %2 = lshr i32 %0, 1
   %3 = and i32 %2, 1431655765      ; 0x55555555
   %4 = sub i32 %0, %3              ; sums of each 2-bit field
   %5 = lshr i32 %4, 2
   %6 = and i32 %5, 858993459       ; 0x33333333
   %7 = mul i32 %6, -3
   %8 = add i32 %7, %4              ; %4 - 3*%6: sums of each 4-bit field
   %9 = lshr i32 %8, 4
   %10 = add i32 %9, %8
   %11 = and i32 %10, 252645135     ; 0x0F0F0F0F: per-byte sums
   %12 = lshr i32 %11, 8
   %13 = add nuw nsw i32 %12, %11   ; combine adjacent byte sums
   %14 = lshr i32 %13, 16
   %15 = add nuw nsw i32 %14, %13   ; combine the two 16-bit halves
   %16 = and i32 %15, 63            ; 2*32-1: keep only the low bits holding the total
   ret i32 %16
 }

 ; 16-bit scalar popcount (variant 2 - shift+add tail, no multiply).
 ; The assertions expect a single llvm.ctpop.i16 call.
 define i16 @popcnt2_16(i16 noundef %0) {
 ; CHECK-LABEL: @popcnt2_16(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.ctpop.i16(i16 [[TMP0:%.*]])
 ; CHECK-NEXT:    ret i16 [[TMP2]]
 ;
   %2 = lshr i16 %0, 1
   %3 = and i16 %2, 21845           ; 0x5555
   %4 = sub i16 %0, %3              ; sums of each 2-bit field
   %5 = and i16 %4, 13107           ; 0x3333
   %6 = lshr i16 %4, 2
   %7 = and i16 %6, 13107           ; 0x3333
   %8 = add nuw nsw i16 %7, %5      ; sums of each 4-bit field
   %9 = lshr i16 %8, 4
   %10 = add nuw nsw i16 %9, %8
   %11 = and i16 %10, 3855          ; 0x0F0F: per-byte sums
   %12 = lshr i16 %11, 8
   %13 = add nuw nsw i16 %12, %11   ; combine the two byte sums
   %14 = and i16 %13, 31            ; 2*16-1: keep only the low bits holding the total
   ret i16 %14
 }

 ; 64-bit scalar popcount (variant 2 - shift+add tail, no multiply).
 ; The assertions expect a single llvm.ctpop.i64 call.
 define i64 @popcnt2_64(i64 noundef %0) {
 ; CHECK-LABEL: @popcnt2_64(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0:%.*]])
 ; CHECK-NEXT:    ret i64 [[TMP2]]
 ;
   %2 = lshr i64 %0, 1
   %3 = and i64 %2, 6148914691236517205     ; 0x5555555555555555
   %4 = sub i64 %0, %3                      ; sums of each 2-bit field
   %5 = and i64 %4, 3689348814741910323     ; 0x3333333333333333
   %6 = lshr i64 %4, 2
   %7 = and i64 %6, 3689348814741910323     ; 0x3333333333333333
   %8 = add nuw nsw i64 %7, %5              ; sums of each 4-bit field
   %9 = lshr i64 %8, 4
   %10 = add nuw nsw i64 %9, %8
   %11 = and i64 %10, 1085102592571150095   ; 0x0F0F0F0F0F0F0F0F: per-byte sums
   %12 = lshr i64 %11, 8
   %13 = add nuw nsw i64 %12, %11           ; combine adjacent byte sums
   %14 = lshr i64 %13, 16
   %15 = add nuw nsw i64 %14, %13           ; combine adjacent 16-bit sums
   %16 = lshr i64 %15, 32
   %17 = add nuw nsw i64 %16, %15           ; combine the two 32-bit halves
   %18 = and i64 %17, 127                   ; 2*64-1: keep only the low bits holding the total
   ret i64 %18
 }

 ; 16-bit vector popcount (variant 2 - shift+add tail) - 8 elements.
 ; The assertions expect a single llvm.ctpop.v8i16 call.
 define <8 x i16> @popcnt2_16vec(<8 x i16> %0) {
 ; CHECK-LABEL: @popcnt2_16vec(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP0:%.*]])
 ; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 ;
   %2 = lshr <8 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   ; 21845 = 0x5555
   %3 = and <8 x i16> %2, <i16 21845, i16 21845, i16 21845, i16 21845, i16 21845, i16 21845, i16 21845, i16 21845>
   %4 = sub <8 x i16> %0, %3
   ; 13107 = 0x3333
   %5 = and <8 x i16> %4, <i16 13107, i16 13107, i16 13107, i16 13107, i16 13107, i16 13107, i16 13107, i16 13107>
   %6 = lshr <8 x i16> %4, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   %7 = and <8 x i16> %6, <i16 13107, i16 13107, i16 13107, i16 13107, i16 13107, i16 13107, i16 13107, i16 13107>
   %8 = add nuw nsw <8 x i16> %7, %5
   %9 = lshr <8 x i16> %8, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
   %10 = add nuw nsw <8 x i16> %9, %8
   ; 3855 = 0x0F0F: per-byte sums in each lane.
   %11 = and <8 x i16> %10, <i16 3855, i16 3855, i16 3855, i16 3855, i16 3855, i16 3855, i16 3855, i16 3855>
   ; Shift+add tail: combine the two byte sums, then mask with 31 (2*16-1).
   %12 = lshr <8 x i16> %11, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %13 = add nuw nsw <8 x i16> %12, %11
   %14 = and <8 x i16> %13, <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>
   ret <8 x i16> %14
 }

 ; 32-bit vector popcount (variant 2 - shift+add tail) - 4 elements.
 ; The assertions expect a single llvm.ctpop.v4i32 call.
 define <4 x i32> @popcnt2_32vec(<4 x i32> %0) {
 ; CHECK-LABEL: @popcnt2_32vec(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP0:%.*]])
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %2 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
   ; 1431655765 = 0x55555555
   %3 = and <4 x i32> %2, <i32 1431655765, i32 1431655765, i32 1431655765, i32 1431655765>
   %4 = sub <4 x i32> %0, %3
   ; 858993459 = 0x33333333
   %5 = and <4 x i32> %4, <i32 858993459, i32 858993459, i32 858993459, i32 858993459>
   %6 = lshr <4 x i32> %4, <i32 2, i32 2, i32 2, i32 2>
   %7 = and <4 x i32> %6, <i32 858993459, i32 858993459, i32 858993459, i32 858993459>
   %8 = add nuw nsw <4 x i32> %7, %5
   %9 = lshr <4 x i32> %8, <i32 4, i32 4, i32 4, i32 4>
   %10 = add nuw nsw <4 x i32> %9, %8
   ; 252645135 = 0x0F0F0F0F: per-byte sums in each lane.
   %11 = and <4 x i32> %10, <i32 252645135, i32 252645135, i32 252645135, i32 252645135>
   ; Shift+add tail: combine the byte sums, then mask with 63 (2*32-1).
   %12 = lshr <4 x i32> %11, <i32 8, i32 8, i32 8, i32 8>
   %13 = add nuw nsw <4 x i32> %12, %11
   %14 = lshr <4 x i32> %13, <i32 16, i32 16, i32 16, i32 16>
   %15 = add nuw nsw <4 x i32> %14, %13
   %16 = and <4 x i32> %15, <i32 63, i32 63, i32 63, i32 63>
   ret <4 x i32> %16
 }

 ; 64-bit vector popcount (variant 2 - shift+add tail) - 2 elements.
 ; The assertions expect a single llvm.ctpop.v2i64 call.
 define <2 x i64> @popcnt2_64vec(<2 x i64> %0) {
 ; CHECK-LABEL: @popcnt2_64vec(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP0:%.*]])
 ; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 ;
   %2 = lshr <2 x i64> %0, <i64 1, i64 1>
   ; 6148914691236517205 = 0x5555555555555555
   %3 = and <2 x i64> %2, <i64 6148914691236517205, i64 6148914691236517205>
   %4 = sub <2 x i64> %0, %3
   ; 3689348814741910323 = 0x3333333333333333
   %5 = and <2 x i64> %4, <i64 3689348814741910323, i64 3689348814741910323>
   %6 = lshr <2 x i64> %4, <i64 2, i64 2>
   %7 = and <2 x i64> %6, <i64 3689348814741910323, i64 3689348814741910323>
   %8 = add nuw nsw <2 x i64> %7, %5
   %9 = lshr <2 x i64> %8, <i64 4, i64 4>
   %10 = add nuw nsw <2 x i64> %9, %8
   ; 1085102592571150095 = 0x0F0F0F0F0F0F0F0F: per-byte sums in each lane.
   %11 = and <2 x i64> %10, <i64 1085102592571150095, i64 1085102592571150095>
   ; Shift+add tail: combine the byte sums, then mask with 127 (2*64-1).
   %12 = lshr <2 x i64> %11, <i64 8, i64 8>
   %13 = add nuw nsw <2 x i64> %12, %11
   %14 = lshr <2 x i64> %13, <i64 16, i64 16>
   %15 = add nuw nsw <2 x i64> %14, %13
   %16 = lshr <2 x i64> %15, <i64 32, i64 32>
   %17 = add nuw nsw <2 x i64> %16, %15
   %18 = and <2 x i64> %17, <i64 127, i64 127>
   ret <2 x i64> %18
 }

 ; 16-bit scalar popcount (variant 3 - using multiply by -3).
 ; The 4-bit-field step is x + (-3 * ((x >> 2) & 0x3333)), arithmetically the
 ; same as (x & 0x3333) + ((x >> 2) & 0x3333). The assertions expect a single
 ; llvm.ctpop.i16 call.
 define i16 @popcnt3_16(i16 noundef %0) {
 ; CHECK-LABEL: @popcnt3_16(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.ctpop.i16(i16 [[TMP0:%.*]])
 ; CHECK-NEXT:    ret i16 [[TMP2]]
 ;
   %2 = lshr i16 %0, 1
   %3 = and i16 %2, 21845           ; 0x5555
   %4 = sub i16 %0, %3              ; sums of each 2-bit field
   %5 = lshr i16 %4, 2
   %6 = and i16 %5, 13107           ; 0x3333
   %7 = mul i16 %6, -3
   %8 = add i16 %7, %4              ; %4 - 3*%6: sums of each 4-bit field
   %9 = lshr i16 %8, 4
   %10 = add i16 %9, %8
   %11 = and i16 %10, 3855          ; 0x0F0F: per-byte sums
   %12 = lshr i16 %11, 8
   %13 = add nuw nsw i16 %12, %11   ; combine the two byte sums
   %14 = and i16 %13, 31            ; 2*16-1: keep only the low bits holding the total
   ret i16 %14
 }

 ; 64-bit scalar popcount (variant 3 - using multiply by -3).
 ; The 4-bit-field step is x + (-3 * ((x >> 2) & 0x3333333333333333)),
 ; arithmetically the same as the two-mask form used in @popcnt2_64. The
 ; assertions expect a single llvm.ctpop.i64 call.
 define i64 @popcnt3_64(i64 noundef %0) {
 ; CHECK-LABEL: @popcnt3_64(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0:%.*]])
 ; CHECK-NEXT:    ret i64 [[TMP2]]
 ;
   %2 = lshr i64 %0, 1
   %3 = and i64 %2, 6148914691236517205     ; 0x5555555555555555
   %4 = sub i64 %0, %3                      ; sums of each 2-bit field
   %5 = lshr i64 %4, 2
   %6 = and i64 %5, 3689348814741910323     ; 0x3333333333333333
   %7 = mul i64 %6, -3
   %8 = add i64 %7, %4                      ; %4 - 3*%6: sums of each 4-bit field
   %9 = lshr i64 %8, 4
   %10 = add i64 %9, %8
   %11 = and i64 %10, 1085102592571150095   ; 0x0F0F0F0F0F0F0F0F: per-byte sums
   %12 = lshr i64 %11, 8
   %13 = add nuw nsw i64 %12, %11           ; combine adjacent byte sums
   %14 = lshr i64 %13, 16
   %15 = add nuw nsw i64 %14, %13           ; combine adjacent 16-bit sums
   %16 = lshr i64 %15, 32
   %17 = add nuw nsw i64 %16, %15           ; combine the two 32-bit halves
   %18 = and i64 %17, 127                   ; 2*64-1: keep only the low bits holding the total
   ret i64 %18
 }

 ; 16-bit vector popcount (variant 3 - using multiply by -3) - 8 elements.
 ; The assertions expect a single llvm.ctpop.v8i16 call.
 define <8 x i16> @popcnt3_16vec(<8 x i16> %0) {
 ; CHECK-LABEL: @popcnt3_16vec(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP0:%.*]])
 ; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 ;
   %2 = lshr <8 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
   ; 21845 = 0x5555
   %3 = and <8 x i16> %2, <i16 21845, i16 21845, i16 21845, i16 21845, i16 21845, i16 21845, i16 21845, i16 21845>
   %4 = sub <8 x i16> %0, %3
   %5 = lshr <8 x i16> %4, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ; 13107 = 0x3333
   %6 = and <8 x i16> %5, <i16 13107, i16 13107, i16 13107, i16 13107, i16 13107, i16 13107, i16 13107, i16 13107>
   ; %4 + (-3 * %6) yields the sums of each 4-bit field.
   %7 = mul <8 x i16> %6, <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>
   %8 = add <8 x i16> %7, %4
   %9 = lshr <8 x i16> %8, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
   %10 = add <8 x i16> %9, %8
   ; 3855 = 0x0F0F: per-byte sums in each lane.
   %11 = and <8 x i16> %10, <i16 3855, i16 3855, i16 3855, i16 3855, i16 3855, i16 3855, i16 3855, i16 3855>
   ; Shift+add tail: combine the two byte sums, then mask with 31 (2*16-1).
   %12 = lshr <8 x i16> %11, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %13 = add nuw nsw <8 x i16> %12, %11
   %14 = and <8 x i16> %13, <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>
   ret <8 x i16> %14
 }

 ; 32-bit vector popcount (variant 3 - using multiply by -3) - 4 elements.
 ; The assertions expect a single llvm.ctpop.v4i32 call.
 define <4 x i32> @popcnt3_32vec(<4 x i32> %0) {
 ; CHECK-LABEL: @popcnt3_32vec(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP0:%.*]])
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %2 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
   ; 1431655765 = 0x55555555
   %3 = and <4 x i32> %2, <i32 1431655765, i32 1431655765, i32 1431655765, i32 1431655765>
   %4 = sub <4 x i32> %0, %3
   %5 = lshr <4 x i32> %4, <i32 2, i32 2, i32 2, i32 2>
   ; 858993459 = 0x33333333
   %6 = and <4 x i32> %5, <i32 858993459, i32 858993459, i32 858993459, i32 858993459>
   ; %4 + (-3 * %6) yields the sums of each 4-bit field.
   %7 = mul <4 x i32> %6, <i32 -3, i32 -3, i32 -3, i32 -3>
   %8 = add <4 x i32> %7, %4
   %9 = lshr <4 x i32> %8, <i32 4, i32 4, i32 4, i32 4>
   %10 = add <4 x i32> %9, %8
   ; 252645135 = 0x0F0F0F0F: per-byte sums in each lane.
   %11 = and <4 x i32> %10, <i32 252645135, i32 252645135, i32 252645135, i32 252645135>
   ; Shift+add tail: combine the byte sums, then mask with 63 (2*32-1).
   %12 = lshr <4 x i32> %11, <i32 8, i32 8, i32 8, i32 8>
   %13 = add nuw nsw <4 x i32> %12, %11
   %14 = lshr <4 x i32> %13, <i32 16, i32 16, i32 16, i32 16>
   %15 = add nuw nsw <4 x i32> %14, %13
   %16 = and <4 x i32> %15, <i32 63, i32 63, i32 63, i32 63>
   ret <4 x i32> %16
 }

 ; 64-bit vector popcount (variant 3 - using multiply by -3) - 2 elements.
 ; The assertions expect a single llvm.ctpop.v2i64 call.
 define <2 x i64> @popcnt3_64vec(<2 x i64> %0) {
 ; CHECK-LABEL: @popcnt3_64vec(
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP0:%.*]])
 ; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
 ;
   %2 = lshr <2 x i64> %0, <i64 1, i64 1>
   ; 6148914691236517205 = 0x5555555555555555
   %3 = and <2 x i64> %2, <i64 6148914691236517205, i64 6148914691236517205>
   %4 = sub <2 x i64> %0, %3
   %5 = lshr <2 x i64> %4, <i64 2, i64 2>
   ; 3689348814741910323 = 0x3333333333333333
   %6 = and <2 x i64> %5, <i64 3689348814741910323, i64 3689348814741910323>
   ; %4 + (-3 * %6) yields the sums of each 4-bit field.
   %7 = mul <2 x i64> %6, <i64 -3, i64 -3>
   %8 = add <2 x i64> %7, %4
   %9 = lshr <2 x i64> %8, <i64 4, i64 4>
   %10 = add <2 x i64> %9, %8
   ; 1085102592571150095 = 0x0F0F0F0F0F0F0F0F: per-byte sums in each lane.
   %11 = and <2 x i64> %10, <i64 1085102592571150095, i64 1085102592571150095>
   ; Shift+add tail: combine the byte sums, then mask with 127 (2*64-1).
   %12 = lshr <2 x i64> %11, <i64 8, i64 8>
   %13 = add nuw nsw <2 x i64> %12, %11
   %14 = lshr <2 x i64> %13, <i64 16, i64 16>
   %15 = add nuw nsw <2 x i64> %14, %13
   %16 = lshr <2 x i64> %15, <i64 32, i64 32>
   %17 = add nuw nsw <2 x i64> %16, %15
   %18 = and <2 x i64> %17, <i64 127, i64 127>
   ret <2 x i64> %18
 }

 ; Negative test case for popcnt2 i8 (should NOT optimize).
 ; The masks used here (85 = 0x55, 51 = 0x33, 15 = 0x0F) are the standard
 ; ones for the idiom; none of them is altered. The sequence ends at the 0x0F
 ; mask with no further reduction tail, and the assertions expect it to come
 ; out unchanged.
 ; NOTE(review): presumably the fold is skipped because this i8 shape is not
 ; matched by the pass, not because of a wrong constant -- confirm.
 define i8 @popcnt2_negative_i8(i8 noundef %0) {
 ; CHECK-LABEL: @popcnt2_negative_i8(
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP0:%.*]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP2]], 85
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i8 [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = and i8 [[TMP4]], 51
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr i8 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = and i8 [[TMP6]], 51
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i8 [[TMP7]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr i8 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i8 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = and i8 [[TMP10]], 15
 ; CHECK-NEXT:    ret i8 [[TMP11]]
 ;
   %2 = lshr i8 %0, 1
   %3 = and i8 %2, 85               ; 0x55
   %4 = sub i8 %0, %3
   %5 = and i8 %4, 51               ; 0x33
   %6 = lshr i8 %4, 2
   %7 = and i8 %6, 51               ; 0x33
   %8 = add nuw nsw i8 %7, %5
   %9 = lshr i8 %8, 4
   %10 = add nuw nsw i8 %9, %8
   %11 = and i8 %10, 15             ; 0x0F
   ret i8 %11
 }

 ; Negative test case for popcnt3 i8 (should NOT optimize).
 ; The multiplier is the standard -3 and the masks (85 = 0x55, 51 = 0x33,
 ; 15 = 0x0F) are the standard ones; nothing is altered. The sequence ends at
 ; the 0x0F mask with no further reduction tail, and the assertions expect it
 ; to come out unchanged.
 ; NOTE(review): presumably the fold is skipped because this i8 shape is not
 ; matched by the pass, not because of a wrong multiplier -- confirm.
 define i8 @popcnt3_negative_i8(i8 noundef %0) {
 ; CHECK-LABEL: @popcnt3_negative_i8(
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP0:%.*]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP2]], 85
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i8 [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = and i8 [[TMP5]], 51
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i8 [[TMP6]], -3
 ; CHECK-NEXT:    [[TMP8:%.*]] = add i8 [[TMP7]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr i8 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i8 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = and i8 [[TMP10]], 15
 ; CHECK-NEXT:    ret i8 [[TMP11]]
 ;
   %2 = lshr i8 %0, 1
   %3 = and i8 %2, 85               ; 0x55
   %4 = sub i8 %0, %3
   %5 = lshr i8 %4, 2
   %6 = and i8 %5, 51               ; 0x33
   %7 = mul i8 %6, -3
   %8 = add i8 %7, %4               ; %4 - 3*%6
   %9 = lshr i8 %8, 4
   %10 = add i8 %9, %8
   %11 = and i8 %10, 15             ; 0x0F
   ret i8 %11
 }

 ; Negative test case for popcnt2 i32 - wrong constant in second step (should NOT optimize)
 ; The first 0x33333333 mask is replaced by 858993460 (0x33333334), so the
 ; sequence no longer computes a population count. The assertions expect the
 ; instructions to come out unchanged.
 define i32 @popcnt2_negative_i32(i32 noundef %0) {
 ; CHECK-LABEL: @popcnt2_negative_i32(
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP2]], 1431655765
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], 858993460
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = and i32 [[TMP6]], 858993459
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i32 [[TMP7]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr i32 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i32 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = and i32 [[TMP10]], 252645135
 ; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = lshr i32 [[TMP13]], 16
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i32 [[TMP14]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 63
 ; CHECK-NEXT:    ret i32 [[TMP16]]
 ;
   %2 = lshr i32 %0, 1
   %3 = and i32 %2, 1431655765      ; 0x55555555
   %4 = sub i32 %0, %3
   %5 = and i32 %4, 858993460  ; Wrong constant (should be 858993459)
   %6 = lshr i32 %4, 2
   %7 = and i32 %6, 858993459       ; 0x33333333 (correct here)
   %8 = add nuw nsw i32 %7, %5
   %9 = lshr i32 %8, 4
   %10 = add nuw nsw i32 %9, %8
   %11 = and i32 %10, 252645135     ; 0x0F0F0F0F
   %12 = lshr i32 %11, 8
   %13 = add nuw nsw i32 %12, %11
   %14 = lshr i32 %13, 16
   %15 = add nuw nsw i32 %14, %13
   %16 = and i32 %15, 63
   ret i32 %16
 }

 ; Negative test case for popcnt3 i32 - using wrong multiplier (should NOT optimize)
 ; The multiplier is -2 instead of -3, so %4 + (-2 * %6) is no longer the sum
 ; of each 4-bit field. The assertions expect the instructions to come out
 ; unchanged.
 define i32 @popcnt3_negative_i32(i32 noundef %0) {
 ; CHECK-LABEL: @popcnt3_negative_i32(
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP2]], 1431655765
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], 858993459
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], -2
 ; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr i32 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = and i32 [[TMP10]], 252645135
 ; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP11]], 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = lshr i32 [[TMP13]], 16
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i32 [[TMP14]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 63
 ; CHECK-NEXT:    ret i32 [[TMP16]]
 ;
   %2 = lshr i32 %0, 1
   %3 = and i32 %2, 1431655765      ; 0x55555555
   %4 = sub i32 %0, %3
   %5 = lshr i32 %4, 2
   %6 = and i32 %5, 858993459       ; 0x33333333
   %7 = mul i32 %6, -2  ; Wrong multiplier (should be -3)
   %8 = add i32 %7, %4
   %9 = lshr i32 %8, 4
   %10 = add i32 %9, %8
   %11 = and i32 %10, 252645135     ; 0x0F0F0F0F
   %12 = lshr i32 %11, 8
   %13 = add nuw nsw i32 %12, %11
   %14 = lshr i32 %13, 16
   %15 = add nuw nsw i32 %14, %13
   %16 = and i32 %15, 63
   ret i32 %16
 }

 ; Negative test case for popcnt2 i24 - non-power-of-2 bit width (should NOT optimize)
 ; The masks are the usual 0x55/0x33/0x0F byte patterns truncated to 24 bits,
 ; followed by the shift+add tail. The assertions expect the instructions to
 ; come out unchanged (hex literals printed as decimal).
 define i24 @popcnt2_negative_i24(i24 noundef %0) {
 ; CHECK-LABEL: @popcnt2_negative_i24(
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i24 [[TMP0:%.*]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i24 [[TMP2]], 5592405
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i24 [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = and i24 [[TMP4]], 3355443
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr i24 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = and i24 [[TMP6]], 3355443
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i24 [[TMP7]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr i24 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i24 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = and i24 [[TMP10]], 986895
 ; CHECK-NEXT:    [[TMP12:%.*]] = lshr i24 [[TMP11]], 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i24 [[TMP12]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = lshr i24 [[TMP13]], 16
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i24 [[TMP14]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = and i24 [[TMP15]], 47
 ; CHECK-NEXT:    ret i24 [[TMP16]]
 ;
   %2 = lshr i24 %0, 1
   %3 = and i24 %2, u0x555555
   %4 = sub i24 %0, %3
   %5 = and i24 %4, u0x333333
   %6 = lshr i24 %4, 2
   %7 = and i24 %6, 3355443         ; same value as u0x333333 above, written in decimal
   %8 = add nuw nsw i24 %7, %5
   %9 = lshr i24 %8, 4
   %10 = add nuw nsw i24 %9, %8
   %11 = and i24 %10, u0x0F0F0F
   %12 = lshr i24 %11, 8
   %13 = add nuw nsw i24 %12, %11
   %14 = lshr i24 %13, 16
   %15 = add nuw nsw i24 %14, %13
   %16 = and i24 %15, 47  ; 2*24-1 = 47
   ret i24 %16
 }

 ; Negative test case for popcnt2 i40 - non-power-of-2 bit width (should NOT optimize)
 ; The masks are the usual 0x55/0x33/0x0F byte patterns truncated to 40 bits,
 ; followed by the shift+add tail. The assertions expect the instructions to
 ; come out unchanged (hex literals printed as decimal).
 define i40 @popcnt2_negative_i40(i40 noundef %0) {
 ; CHECK-LABEL: @popcnt2_negative_i40(
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i40 [[TMP0:%.*]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i40 [[TMP2]], 366503875925
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i40 [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = and i40 [[TMP4]], 219902325555
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr i40 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = and i40 [[TMP6]], 219902325555
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i40 [[TMP7]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr i40 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i40 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = and i40 [[TMP10]], 64677154575
 ; CHECK-NEXT:    [[TMP12:%.*]] = lshr i40 [[TMP11]], 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i40 [[TMP12]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = lshr i40 [[TMP13]], 16
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i40 [[TMP14]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = lshr i40 [[TMP15]], 32
 ; CHECK-NEXT:    [[TMP17:%.*]] = add nuw nsw i40 [[TMP16]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = and i40 [[TMP17]], 79
 ; CHECK-NEXT:    ret i40 [[TMP18]]
 ;
   %2 = lshr i40 %0, 1
   %3 = and i40 %2, u0x5555555555
   %4 = sub i40 %0, %3
   %5 = and i40 %4, u0x3333333333
   %6 = lshr i40 %4, 2
   %7 = and i40 %6, 219902325555    ; same value as u0x3333333333 above, written in decimal
   %8 = add nuw nsw i40 %7, %5
   %9 = lshr i40 %8, 4
   %10 = add nuw nsw i40 %9, %8
   %11 = and i40 %10, u0x0F0F0F0F0F
   %12 = lshr i40 %11, 8
   %13 = add nuw nsw i40 %12, %11
   %14 = lshr i40 %13, 16
   %15 = add nuw nsw i40 %14, %13
   %16 = lshr i40 %15, 32
   %17 = add nuw nsw i40 %16, %15
   %18 = and i40 %17, 79  ; 2*40-1 = 79
   ret i40 %18
 }

 ; Negative test case for popcnt2 i48 - non-power-of-2 bit width (should NOT optimize)
 ; The masks are the usual 0x55/0x33/0x0F byte patterns truncated to 48 bits,
 ; followed by the shift+add tail. The assertions expect the instructions to
 ; come out unchanged (hex literals printed as decimal).
 define i48 @popcnt2_negative_i48(i48 noundef %0) {
 ; CHECK-LABEL: @popcnt2_negative_i48(
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i48 [[TMP0:%.*]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i48 [[TMP2]], 93824992236885
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i48 [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = and i48 [[TMP4]], 56294995342131
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr i48 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = and i48 [[TMP6]], 56294995342131
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i48 [[TMP7]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr i48 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i48 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = and i48 [[TMP10]], 16557351571215
 ; CHECK-NEXT:    [[TMP12:%.*]] = lshr i48 [[TMP11]], 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i48 [[TMP12]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = lshr i48 [[TMP13]], 16
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i48 [[TMP14]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = lshr i48 [[TMP15]], 32
 ; CHECK-NEXT:    [[TMP17:%.*]] = add nuw nsw i48 [[TMP16]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = and i48 [[TMP17]], 95
 ; CHECK-NEXT:    ret i48 [[TMP18]]
 ;
   %2 = lshr i48 %0, 1
   %3 = and i48 %2, u0x555555555555
   %4 = sub i48 %0, %3              ; sums of each 2-bit field
   %5 = and i48 %4, u0x333333333333
   %6 = lshr i48 %4, 2
   %7 = and i48 %6, u0x333333333333
   %8 = add nuw nsw i48 %7, %5      ; sums of each 4-bit field
   %9 = lshr i48 %8, 4
   %10 = add nuw nsw i48 %9, %8
   %11 = and i48 %10, u0x0F0F0F0F0F0F
   %12 = lshr i48 %11, 8
   %13 = add nuw nsw i48 %12, %11
   %14 = lshr i48 %13, 16
   %15 = add nuw nsw i48 %14, %13
   %16 = lshr i48 %15, 32
   %17 = add nuw nsw i48 %16, %15
   %18 = and i48 %17, 95  ; 2*48-1 = 95
   ret i48 %18
 }

 ; Negative test case for popcnt2 i56 - non-power-of-2 bit width (should NOT optimize)
 ; The masks are the usual 0x55/0x33/0x0F byte patterns truncated to 56 bits,
 ; followed by the shift+add tail. The assertions expect the instructions to
 ; come out unchanged (hex literals printed as decimal).
 define i56 @popcnt2_negative_i56(i56 noundef %0) {
 ; CHECK-LABEL: @popcnt2_negative_i56(
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i56 [[TMP0:%.*]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = and i56 [[TMP2]], 24019198012642645
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i56 [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = and i56 [[TMP4]], 14411518807585587
 ; CHECK-NEXT:    [[TMP6:%.*]] = lshr i56 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = and i56 [[TMP6]], 14411518807585587
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i56 [[TMP7]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr i56 [[TMP8]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i56 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = and i56 [[TMP10]], 4238682002231055
 ; CHECK-NEXT:    [[TMP12:%.*]] = lshr i56 [[TMP11]], 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i56 [[TMP12]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = lshr i56 [[TMP13]], 16
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i56 [[TMP14]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = lshr i56 [[TMP15]], 32
 ; CHECK-NEXT:    [[TMP17:%.*]] = add nuw nsw i56 [[TMP16]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = and i56 [[TMP17]], 111
 ; CHECK-NEXT:    ret i56 [[TMP18]]
 ;
   %2 = lshr i56 %0, 1
   %3 = and i56 %2, u0x55555555555555
   %4 = sub i56 %0, %3              ; sums of each 2-bit field
   %5 = and i56 %4, u0x33333333333333
   %6 = lshr i56 %4, 2
   %7 = and i56 %6, u0x33333333333333
   %8 = add nuw nsw i56 %7, %5      ; sums of each 4-bit field
   %9 = lshr i56 %8, 4
   %10 = add nuw nsw i56 %9, %8
   %11 = and i56 %10, u0x0F0F0F0F0F0F0F
   %12 = lshr i56 %11, 8
   %13 = add nuw nsw i56 %12, %11
   %14 = lshr i56 %13, 16
   %15 = add nuw nsw i56 %14, %13
   %16 = lshr i56 %15, 32
   %17 = add nuw nsw i56 %16, %15
   %18 = and i56 %17, 111  ; 2*56-1 = 111
   ret i56 %18
 }