test/Transforms/AggressiveInstCombine/umulh_carry4.ll - llvm-project/llvm - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s

 ; https://alive2.llvm.org/ce/z/KuJPnU
 define i64 @umulh(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP4]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; Commutative ops should match in any order. Ops where operand order has been
 ; reversed from above are marked 'commuted'. As per instcombine contributors
 ; guide, constants are always canonicalized to RHS, so don't bother commuting
 ; constants.
 define i64 @umulh__commuted(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__commuted(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP4]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %x_hi, %y_lo       ; commuted
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %x_lo, %y_hi       ; commuted
   %y_lo_x_lo = mul nuw i64 %x_lo, %y_lo       ; commuted

   ; Add cross terms
   %cross_sum = add i64 %y_lo_x_hi, %y_hi_x_lo ; commuted

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %y_lo_x_lo_hi, %cross_sum_lo ; commuted

   ; Final result accumulation
   %intermediate = add nuw i64 %y_hi_x_hi, %cross_sum_hi ; commuted
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %carry, %intermediate ; commuted
   %hw64 = add i64 %low_accum_hi, %intermediate_plus_carry ; commuted

   ret i64 %hw64
 }

 define i32 @mulh_src32(i32 %x, i32 %y) {
   ; Extract low and high 16 bits
 ; CHECK-LABEL: define i32 @mulh_src32(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[X]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[Y]] to i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP3]], 32
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc nuw i64 [[TMP4]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP5]]
 ;
   %x_lo = and i32 %x, u0xffff              ; x & 0xffffffff
   %y_lo = and i32 %y, u0xffff              ; y & 0xffffffff
   %x_hi = lshr i32 %x, 16                     ; x >> 16
   %y_hi = lshr i32 %y, 16                     ; y >> 16

   ; Cross products
   %y_lo_x_hi = mul nuw i32 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i32 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i32 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i32 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i32 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i32 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i32 u0x10000, i32 0 ; if overflow, add 1 << 16

   ; High 16 bits of low product
   %y_lo_x_lo_hi = lshr i32 %y_lo_x_lo, 16

   ; Low and high 16 bits of cross_sum
   %cross_sum_lo = and i32 %cross_sum, u0xffff
   %cross_sum_hi = lshr i32 %cross_sum, 16

   %low_accum = add nuw nsw i32 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i32 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i32 %low_accum, 16
   %intermediate_plus_carry = add i32 %intermediate, %carry
   %hw64 = add i32 %intermediate_plus_carry, %low_accum_hi

   ret i32 %hw64
 }

 define i128 @mulh_src128(i128 %x, i128 %y) {
   ; Extract low and high 64 bits
 ; CHECK-LABEL: define i128 @mulh_src128(
 ; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i128 [[X]] to i256
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i128 [[Y]] to i256
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i256 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i256 [[TMP3]], 128
 ; CHECK-NEXT:    [[HW64:%.*]] = trunc nuw i256 [[TMP4]] to i128
 ; CHECK-NEXT:    ret i128 [[HW64]]
 ;
   %x_lo = and i128 %x, u0xffffffffffffffff              ; x & 0xffffffff
   %y_lo = and i128 %y, u0xffffffffffffffff              ; y & 0xffffffff
   %x_hi = lshr i128 %x, 64                     ; x >> 16
   %y_hi = lshr i128 %y, 64                     ; y >> 16

   ; Cross products
   %y_lo_x_hi = mul nuw i128 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i128 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i128 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i128 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i128 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i128 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i128 u0x10000000000000000, i128 0 ; if overflow, add 1 << 16

   ; High 16 bits of low product
   %y_lo_x_lo_hi = lshr i128 %y_lo_x_lo, 64

   ; Low and high 16 bits of cross_sum
   %cross_sum_lo = and i128 %cross_sum, u0xffffffffffffffff
   %cross_sum_hi = lshr i128 %cross_sum, 64

   %low_accum = add nuw nsw i128 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i128 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i128 %low_accum, 64
   %intermediate_plus_carry = add i128 %intermediate, %carry
   %hw64 = add i128 %intermediate_plus_carry, %low_accum_hi

   ret i128 %hw64
 }

 define <2 x i32> @mulh_v2i32(<2 x i32> %x, <2 x i32> %y) {
   ; Extract low and high 16 bits
 ; CHECK-LABEL: define <2 x i32> @mulh_v2i32(
 ; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i32> [[X]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i32> [[Y]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw <2 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 32)
 ; CHECK-NEXT:    [[HW64:%.*]] = trunc nuw <2 x i64> [[TMP4]] to <2 x i32>
 ; CHECK-NEXT:    ret <2 x i32> [[HW64]]
 ;
   %x_lo = and <2 x i32> %x, <i32 u0xffff, i32 u0xffff>
   %y_lo = and <2 x i32> %y, <i32 u0xffff, i32 u0xffff>
   %x_hi = lshr <2 x i32> %x, <i32 16, i32 16>
   %y_hi = lshr <2 x i32> %y, <i32 16, i32 16>

   ; Cross products
   %y_lo_x_hi = mul nuw <2 x i32> %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw <2 x i32> %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw <2 x i32> %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw <2 x i32> %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add <2 x i32> %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult <2 x i32> %cross_sum, %y_lo_x_hi
   %carry = select <2 x i1> %carry_out, <2 x i32> <i32 u0x10000, i32 u0x10000>, <2 x i32> <i32 0, i32 0>

   ; High 16 bits of low product
   %y_lo_x_lo_hi = lshr <2 x i32> %y_lo_x_lo, <i32 16, i32 16>

   ; Low and high 16 bits of cross_sum
   %cross_sum_lo = and <2 x i32> %cross_sum, <i32 u0xffff, i32 u0xffff>
   %cross_sum_hi = lshr <2 x i32> %cross_sum, <i32 16, i32 16>

   %low_accum = add nuw nsw <2 x i32> %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw <2 x i32> %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr <2 x i32> %low_accum, <i32 16, i32 16>
   %intermediate_plus_carry = add <2 x i32> %intermediate, %carry
   %hw64 = add <2 x i32> %intermediate_plus_carry, %low_accum_hi

   ret <2 x i32> %hw64
 }

 ; https://alive2.llvm.org/ce/z/PPXtkR
 define void @full_mul_int128(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[TMP4]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[X]], [[Y]]
 ; CHECK-NEXT:    store i64 [[TMP8]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   ; Store high 64 bits
   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   ; Reconstruct low 64 bits
   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   ; Store low 64 bits
   store i64 %lw64, ptr %p, align 8

   ret void
 }


 ; Negative tests

 define i64 @umulh_notandx(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh_notandx(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967294
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967294              ; x & 0xfffffffe
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 define i64 @umulh_notandy(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh_notandy(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967294
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967294              ; y & 0xfffffffe
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 define i64 @umulh_notshiftx(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh_notshiftx(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 16
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 16                     ; x >> 16
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 define i64 @umulh_notshifty(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh_notshifty(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 16
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 16                     ; y >> 16

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 define i64 @umulh_notcarry(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh_notcarry(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967295, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967295, i64 0 ; if overflow, add wrong value

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 define i64 @umulh_notxlo(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh_notxlo(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x          ; y_lo * x

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 define i64 @umulh_notcrosssum(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh_notcrosssum(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = shl i64 [[Y_HI_X_LO]], 1
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967294
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_hi_x_lo ; wrong crosssum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }


 ; Uses tests.

 ; 'x_lo' can have more than 2 uses.
 define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__x_lo(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[X_LO]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   call void (...) @llvm.fake.use(i64 %x_lo)
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'y_hi' can have more than 2 uses.
 define i64 @umulh__mul_use__y_hi(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__y_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_HI]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32
   call void (...) @llvm.fake.use(i64 %y_hi)

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'y_hi * x_hi' must have no more than 2 uses.
 define i64 @umulh__mul_use__y_lo_x_hi(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_LO_X_HI]])
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   call void (...) @llvm.fake.use(i64 %y_lo_x_hi)
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'y_hi * x_hi' must have single use.
 define i64 @umulh__mul_use__y_hi_x_hi(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_HI_X_HI]])
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   call void (...) @llvm.fake.use(i64 %y_hi_x_hi)
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'y_hi * x_lo' must have single use.
 define i64 @umulh__mul_use__y_hi_x_lo(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_lo(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_HI_X_LO]])
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'y_lo * x_lo' has a single use if only doing high part of multiply and 2 uses
 ; when doing both low/high parts. Doing the optimization when only doing the
 ; high part and there's a 2nd unrelated use here still results in less
 ; instructions and is likely profitable, so this seems ok.
 define i64 @umulh__mul_use__y_lo_x_lo(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP5]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo
   call void (...) @llvm.fake.use(i64 %y_lo_x_lo)

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'cross_sum' must have no more than 3 uses.
 define i64 @umulh__mul_use__cross_sum(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CROSS_SUM]])
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   call void (...) @llvm.fake.use(i64 %cross_sum)

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'carry_out' must have single use.
 define i64 @umulh__mul_use__carry_out(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__carry_out(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i1 [[CARRY_OUT]])
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   call void (...) @llvm.fake.use(i1 %carry_out)
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'carry' must have single use.
 define i64 @umulh__mul_use__carry(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__carry(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CARRY]])
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
   call void (...) @llvm.fake.use(i64 %carry)

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'y_lo_x_lo_hi' must have single use.
 define i64 @umulh__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO_HI]])
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
   call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi)

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'cross_sum_lo' must have single use.
 define i64 @umulh__mul_use__cross_sum_lo(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_lo(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CROSS_SUM_LO]])
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   call void (...) @llvm.fake.use(i64 %cross_sum_lo)
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'cross_sum_hi' must have single use.
 define i64 @umulh__mul_use__cross_sum_hi(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CROSS_SUM_HI]])
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32
   call void (...) @llvm.fake.use(i64 %cross_sum_hi)

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'low_accum' has a single use if only doing high part of multiply and 2 uses
 ; when doing both low/high parts. Unrelated use here, but still seems
 ; profitable.
 define i64 @umulh__mul_use__low_accum(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__low_accum(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul i64 [[Y]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul i64 [[Y_HI]], [[X]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[LOW_ACCUM]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP5]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
   call void (...) @llvm.fake.use(i64 %low_accum)

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'intermediate' must have single use.
 define i64 @umulh__mul_use__intermediate(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__intermediate(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[INTERMEDIATE]])
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   call void (...) @llvm.fake.use(i64 %intermediate)
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'low_accum_hi' must have single use.
 define i64 @umulh__mul_use__low_accum_hi(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__low_accum_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_HI]])
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   call void (...) @llvm.fake.use(i64 %low_accum_hi)
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi

   ret i64 %hw64
 }

 ; 'intermediate_plus_carry' must have single use.
 define i64 @umulh__mul_use__intermediate_plus_carry(i64 %x, i64 %y) {
 ; CHECK-LABEL: define i64 @umulh__mul_use__intermediate_plus_carry(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[INTERMEDIATE_PLUS_CARRY]])
 ; CHECK-NEXT:    ret i64 [[HW64]]
 ;
   ; Extract low and high 32 bits
   %x_lo = and i64 %x, 4294967295              ; x & 0xffffffff
   %y_lo = and i64 %y, 4294967295              ; y & 0xffffffff
   %x_hi = lshr i64 %x, 32                     ; x >> 32
   %y_hi = lshr i64 %y, 32                     ; y >> 32

   ; Cross products
   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi       ; y_lo * x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi       ; y_hi * x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo       ; y_hi * x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo       ; y_lo * x_lo

   ; Add cross terms
   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum

   ; Carry if overflowed
   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32

   ; High 32 bits of low product
   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   ; Low and high 32 bits of cross_sum
   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   ; Final result accumulation
   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
   %low_accum_hi = lshr i64 %low_accum, 32
   %intermediate_plus_carry = add i64 %intermediate, %carry
   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
   call void (...) @llvm.fake.use(i64 %intermediate_plus_carry)

   ret i64 %hw64
 }


 ; 'x_lo' can have multiple uses.
 define void @full_mul_int128__mul_use__x_lo(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__x_lo(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[X_LO]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LW64:%.*]] = mul i64 [[X]], [[Y]]
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   call void (...) @llvm.fake.use(i64 %x_lo)
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'y_lo' can have multiple uses.
 define void @full_mul_int128__mul_use__y_lo(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_LO]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LW64:%.*]] = mul i64 [[X]], [[Y]]
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   call void (...) @llvm.fake.use(i64 %y_lo)
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'x_hi' can have multiple uses.
 define void @full_mul_int128__mul_use__x_hi(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__x_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[X_HI]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LW64:%.*]] = mul i64 [[X]], [[Y]]
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   call void (...) @llvm.fake.use(i64 %x_hi)
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'y_hi' can have multiple uses.
 define void @full_mul_int128__mul_use__y_hi(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_HI]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LW64:%.*]] = mul i64 [[X]], [[Y]]
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32
   call void (...) @llvm.fake.use(i64 %y_hi)

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'y_lo_x_hi' must have exactly 2 uses.
 define void @full_mul_int128__mul_use__y_lo_x_hi(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_LO_X_HI]])
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   call void (...) @llvm.fake.use(i64 %y_lo_x_hi)
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'y_hi_x_hi' must have single use.
 define void @full_mul_int128__mul_use__y_hi_x_hi(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_HI_X_HI]])
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   call void (...) @llvm.fake.use(i64 %y_hi_x_hi)
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'y_hi_x_lo' must have single use.
 define void @full_mul_int128__mul_use__y_hi_x_lo(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_lo(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_HI_X_LO]])
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'y_lo_x_lo' we allow multiple uses on y_lo_x_lo.
 ; TODO does not simplify like it should?
 define void @full_mul_int128__mul_use__y_lo_x_lo(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = mul i64 [[Y]], [[X_HI]]
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = mul i64 [[Y_HI]], [[X]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM1:%.*]] = shl i64 [[TMP6]], 32
 ; CHECK-NEXT:    [[LW64:%.*]] = add i64 [[Y_LO_X_LO]], [[LOW_ACCUM1]]
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
   call void (...) @llvm.fake.use(i64 %y_lo_x_lo)

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'cross_sum' must have no more than 3 uses.
 define void @full_mul_int128__mul_use__cross_sum(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CROSS_SUM]])
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
   call void (...) @llvm.fake.use(i64 %cross_sum)

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'carry_out' must have single use.
 define void @full_mul_int128__mul_use__carry_out(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__carry_out(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i1 [[CARRY_OUT]])
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   call void (...) @llvm.fake.use(i1 %carry_out)
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'carry' must have single use.
 define void @full_mul_int128__mul_use__carry(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__carry(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CARRY]])
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0
   call void (...) @llvm.fake.use(i64 %carry)

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'y_lo_x_lo_hi' must have single use.
 define void @full_mul_int128__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO_HI]])
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
   call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi)

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'cross_sum_lo' must have single use.
 define void @full_mul_int128__mul_use__cross_sum_lo(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_lo(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CROSS_SUM_LO]])
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   call void (...) @llvm.fake.use(i64 %cross_sum_lo)
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'cross_sum_hi' must have single use.
 define void @full_mul_int128__mul_use__cross_sum_hi(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CROSS_SUM_HI]])
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32
   call void (...) @llvm.fake.use(i64 %cross_sum_hi)

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'low_accum' must have exactly 2 uses if doing high multiply.
 define void @full_mul_int128__mul_use__low_accum(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[LOW_ACCUM]])
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
   call void (...) @llvm.fake.use(i64 %low_accum)

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'upper_mid' must have single use.
 define void @full_mul_int128__mul_use__upper_mid(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[UPPER_MID]])
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[TMP9:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[TMP9]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   call void (...) @llvm.fake.use(i64 %upper_mid)
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'low_accum_hi' must have single use.
 define void @full_mul_int128__mul_use__low_accum_hi(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_hi(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_HI]])
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   call void (...) @llvm.fake.use(i64 %low_accum_hi)
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'upper_mid_with_cross' must have single use.
 define void @full_mul_int128__mul_use__upper_mid_with_cross(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid_with_cross(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
 ; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
 ; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
 ; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
 ; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
 ; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
 ; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
 ; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
 ; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
 ; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
 ; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
 ; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
 ; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
 ; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_HI]])
 ; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
 ; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
 ; CHECK-NEXT:    [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
 ; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   call void (...) @llvm.fake.use(i64 %low_accum_hi)
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }

 ; 'low_accum_shifted' can have multiple uses.
 define void @full_mul_int128__mul_use__low_accum_shifted(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_shifted(
 ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
 ; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr [[HI_PTR]], align 8
 ; CHECK-NEXT:    [[LW64:%.*]] = mul i64 [[X]], [[Y]]
 ; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = and i64 [[LW64]], -4294967296
 ; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_SHIFTED]])
 ; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %x_lo = and i64 %x, 4294967295
   %y_lo = and i64 %y, 4294967295
   %x_hi = lshr i64 %x, 32
   %y_hi = lshr i64 %y, 32

   %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
   %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
   %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
   %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo

   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi

   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
   %carry = select i1 %carry_out, i64 4294967296, i64 0

   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32

   %cross_sum_lo = and i64 %cross_sum, 4294967295
   %cross_sum_hi = lshr i64 %cross_sum, 32

   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi

   %upper_mid = add nuw i64 %y_hi_x_hi, %carry
   %low_accum_hi = lshr i64 %low_accum, 32
   %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
   %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi

   %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
   store i64 %hw64, ptr %hi_ptr, align 8

   %low_accum_shifted = shl i64 %low_accum, 32
   call void (...) @llvm.fake.use(i64 %low_accum_shifted)
   %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
   %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo

   store i64 %lw64, ptr %p, align 8

   ret void
 }