blob: fa21721f17762849961ce15f37f8a30897319287 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s
; https://alive2.llvm.org/ce/z/KuJPnU
; Baseline positive case: the high 64 bits of an unsigned 64x64-bit multiply,
; built from four 32-bit partial products with the cross-sum carry
; materialised as a select of 1 << 32. The expected output is a single
; widened i128 multiply followed by a shift right by 64 and a truncate.
define i64 @umulh(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64
; CHECK-NEXT: ret i64 [[TMP4]]
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32 ; carry out of the low accumulation
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi ; high 64 bits of x * y
ret i64 %hw64
}
; Commutative ops should match in any order. Ops where operand order has been
; reversed from above are marked 'commuted'. As per instcombine contributors
; guide, constants are always canonicalized to RHS, so don't bother commuting
; constants.
; The expected output is the same widened i128 multiply / lshr 64 / trunc
; sequence as for the non-commuted form.
define i64 @umulh__commuted(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__commuted(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64
; CHECK-NEXT: ret i64 [[TMP4]]
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %x_hi, %y_lo ; commuted
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi (not commuted)
%y_hi_x_lo = mul nuw i64 %x_lo, %y_hi ; commuted
%y_lo_x_lo = mul nuw i64 %x_lo, %y_lo ; commuted
; Add cross terms
%cross_sum = add i64 %y_lo_x_hi, %y_hi_x_lo ; commuted
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %y_lo_x_lo_hi, %cross_sum_lo ; commuted
; Final result accumulation
%intermediate = add nuw i64 %y_hi_x_hi, %cross_sum_hi ; commuted
%low_accum_hi = lshr i64 %low_accum, 32 ; carry out of the low accumulation
%intermediate_plus_carry = add i64 %carry, %intermediate ; commuted
%hw64 = add i64 %low_accum_hi, %intermediate_plus_carry ; commuted
ret i64 %hw64
}
; Same pattern at i32: 16-bit halves and a carry constant of 1 << 16. The
; expected output widens to an i64 multiply, shifts right by 32 and truncates.
define i32 @mulh_src32(i32 %x, i32 %y) {
; Extract low and high 16 bits
; CHECK-LABEL: define i32 @mulh_src32(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[Y]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP3]], 32
; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i64 [[TMP4]] to i32
; CHECK-NEXT: ret i32 [[TMP5]]
;
%x_lo = and i32 %x, u0xffff ; x & 0xffff
%y_lo = and i32 %y, u0xffff ; y & 0xffff
%x_hi = lshr i32 %x, 16 ; x >> 16
%y_hi = lshr i32 %y, 16 ; y >> 16
; Cross products
%y_lo_x_hi = mul nuw i32 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i32 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i32 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i32 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i32 %y_hi_x_lo, %y_lo_x_hi ; full 32-bit sum
; Carry if overflowed
%carry_out = icmp ult i32 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i32 u0x10000, i32 0 ; if overflow, add 1 << 16
; High 16 bits of low product
%y_lo_x_lo_hi = lshr i32 %y_lo_x_lo, 16
; Low and high 16 bits of cross_sum
%cross_sum_lo = and i32 %cross_sum, u0xffff
%cross_sum_hi = lshr i32 %cross_sum, 16
%low_accum = add nuw nsw i32 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i32 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i32 %low_accum, 16 ; carry out of the low accumulation
%intermediate_plus_carry = add i32 %intermediate, %carry
; Despite the name, %hw64 is an i32 here: the high 32 bits of x * y.
%hw64 = add i32 %intermediate_plus_carry, %low_accum_hi
ret i32 %hw64
}
; Same pattern at i128: 64-bit halves and a carry constant of 1 << 64. The
; expected output widens to an i256 multiply, shifts right by 128 and
; truncates.
define i128 @mulh_src128(i128 %x, i128 %y) {
; Extract low and high 64 bits
; CHECK-LABEL: define i128 @mulh_src128(
; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[X]] to i256
; CHECK-NEXT: [[TMP2:%.*]] = zext i128 [[Y]] to i256
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i256 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i256 [[TMP3]], 128
; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i256 [[TMP4]] to i128
; CHECK-NEXT: ret i128 [[HW64]]
;
%x_lo = and i128 %x, u0xffffffffffffffff ; x & 0xffffffffffffffff
%y_lo = and i128 %y, u0xffffffffffffffff ; y & 0xffffffffffffffff
%x_hi = lshr i128 %x, 64 ; x >> 64
%y_hi = lshr i128 %y, 64 ; y >> 64
; Cross products
%y_lo_x_hi = mul nuw i128 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i128 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i128 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i128 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i128 %y_hi_x_lo, %y_lo_x_hi ; full 128-bit sum
; Carry if overflowed
%carry_out = icmp ult i128 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i128 u0x10000000000000000, i128 0 ; if overflow, add 1 << 64
; High 64 bits of low product
%y_lo_x_lo_hi = lshr i128 %y_lo_x_lo, 64
; Low and high 64 bits of cross_sum
%cross_sum_lo = and i128 %cross_sum, u0xffffffffffffffff
%cross_sum_hi = lshr i128 %cross_sum, 64
%low_accum = add nuw nsw i128 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i128 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i128 %low_accum, 64 ; carry out of the low accumulation
%intermediate_plus_carry = add i128 %intermediate, %carry
; Despite the name, %hw64 is an i128 here: the high 128 bits of x * y.
%hw64 = add i128 %intermediate_plus_carry, %low_accum_hi
ret i128 %hw64
}
; Vector variant: the i32 pattern applied lane-wise to <2 x i32> with splat
; constants. The expected output widens to a <2 x i64> multiply, shifts each
; lane right by 32 and truncates.
define <2 x i32> @mulh_v2i32(<2 x i32> %x, <2 x i32> %y) {
; Extract low and high 16 bits
; CHECK-LABEL: define <2 x i32> @mulh_v2i32(
; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[X]] to <2 x i64>
; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[Y]] to <2 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw <2 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 32)
; CHECK-NEXT: [[HW64:%.*]] = trunc nuw <2 x i64> [[TMP4]] to <2 x i32>
; CHECK-NEXT: ret <2 x i32> [[HW64]]
;
%x_lo = and <2 x i32> %x, <i32 u0xffff, i32 u0xffff> ; x & 0xffff (per lane)
%y_lo = and <2 x i32> %y, <i32 u0xffff, i32 u0xffff> ; y & 0xffff (per lane)
%x_hi = lshr <2 x i32> %x, <i32 16, i32 16> ; x >> 16 (per lane)
%y_hi = lshr <2 x i32> %y, <i32 16, i32 16> ; y >> 16 (per lane)
; Cross products
%y_lo_x_hi = mul nuw <2 x i32> %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw <2 x i32> %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw <2 x i32> %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw <2 x i32> %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add <2 x i32> %y_hi_x_lo, %y_lo_x_hi ; full 32-bit sum per lane
; Carry if overflowed
%carry_out = icmp ult <2 x i32> %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
; If overflow, add 1 << 16 in the affected lane
%carry = select <2 x i1> %carry_out, <2 x i32> <i32 u0x10000, i32 u0x10000>, <2 x i32> <i32 0, i32 0>
; High 16 bits of low product
%y_lo_x_lo_hi = lshr <2 x i32> %y_lo_x_lo, <i32 16, i32 16>
; Low and high 16 bits of cross_sum
%cross_sum_lo = and <2 x i32> %cross_sum, <i32 u0xffff, i32 u0xffff>
%cross_sum_hi = lshr <2 x i32> %cross_sum, <i32 16, i32 16>
%low_accum = add nuw nsw <2 x i32> %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw <2 x i32> %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr <2 x i32> %low_accum, <i32 16, i32 16> ; carry out of the low accumulation
%intermediate_plus_carry = add <2 x i32> %intermediate, %carry
; Despite the name, each %hw64 lane is an i32: the high 32 bits of x * y.
%hw64 = add <2 x i32> %intermediate_plus_carry, %low_accum_hi
ret <2 x i32> %hw64
}
; https://alive2.llvm.org/ce/z/PPXtkR
; Full 64x64 -> 128-bit multiply: the high 64 bits are stored at %p + 8 and
; the low 64 bits at %p. The high-half accumulation adds the carry to
; y_hi * x_hi first, a different association from the plain umulh form. The
; expected output computes the high half through a widened i128 multiply and
; the low half as a plain i64 multiply of x and y.
define void @full_mul_int128(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[TMP4]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[X]], [[Y]]
; CHECK-NEXT: store i64 [[TMP8]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%upper_mid = add nuw i64 %y_hi_x_hi, %carry ; carry is folded into the high product first
%low_accum_hi = lshr i64 %low_accum, 32 ; carry out of the low accumulation
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi ; high 64 bits of x * y
; Store high 64 bits
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Reconstruct low 64 bits
%low_accum_shifted = shl i64 %low_accum, 32 ; low 32 bits of low_accum moved to bits 32..63
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 ; low 32 bits of y_lo * x_lo
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
; Store low 64 bits
store i64 %lw64, ptr %p, align 8
ret void
}
; Negative tests
; Negative: the mask applied to x is 0xfffffffe rather than 0xffffffff, so
; x_lo is not the low half of x. The expected output keeps every instruction
; of the input unchanged.
define i64 @umulh_notandx(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh_notandx(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967294
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967294 ; x & 0xfffffffe (deliberately wrong mask)
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; Negative: the mask applied to y is 0xfffffffe rather than 0xffffffff, so
; y_lo is not the low half of y. The expected output keeps every instruction
; of the input unchanged.
define i64 @umulh_notandy(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh_notandy(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967294
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967294 ; y & 0xfffffffe (deliberately wrong mask)
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; Negative: x is shifted right by 16 rather than 32, so x_hi is not the high
; half of x. The expected output keeps every instruction of the input
; unchanged.
define i64 @umulh_notshiftx(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh_notshiftx(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 16
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low 32 bits; x_hi deliberately uses the wrong shift amount
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 16 ; x >> 16 (deliberately wrong shift)
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; Negative: y is shifted right by 16 rather than 32, so y_hi is not the high
; half of y. The expected output keeps every instruction of the input
; unchanged.
define i64 @umulh_notshifty(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh_notshifty(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 16
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low 32 bits; y_hi deliberately uses the wrong shift amount
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 16 ; y >> 16 (deliberately wrong shift)
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; Negative: the carry select yields 4294967295 (0xffffffff) instead of
; 4294967296 (1 << 32). The expected output keeps every instruction of the
; input unchanged.
define i64 @umulh_notcarry(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh_notcarry(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967295, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
; Deliberately wrong carry constant: 0xffffffff rather than 1 << 32
%carry = select i1 %carry_out, i64 4294967295, i64 0 ; if overflow, add wrong value
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; Negative: the low partial product multiplies y_lo by the full %x instead of
; by x_lo. The expected output keeps every instruction of the input
; unchanged.
define i64 @umulh_notxlo(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh_notxlo(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
; Deliberately wrong operand: the unmasked %x, not %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x ; y_lo * x
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; Negative: the cross sum adds y_hi * x_lo to itself instead of adding the
; two different cross products. The expected output is not folded to a
; widened multiply; the only changes are that the self-add becomes a shift
; left by 1 and the low mask of the sum becomes 4294967294.
define i64 @umulh_notcrosssum(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh_notcrosssum(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = shl i64 [[Y_HI_X_LO]], 1
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967294
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms (deliberately the same term twice)
%cross_sum = add i64 %y_hi_x_lo, %y_hi_x_lo ; wrong crosssum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; compared against a value that is not an addend of cross_sum
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; Uses tests.
; 'x_lo' can have more than 2 uses.
; Here x_lo has an additional user (llvm.fake.use) besides the two partial
; products. The expected output still contains the widened i128 multiply and
; keeps the 'and' only to feed the extra user.
define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__x_lo(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_LO]])
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
call void (...) @llvm.fake.use(i64 %x_lo) ; extra use outside the multiply pattern
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32 ; carry out of the low accumulation
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi ; high 64 bits of x * y
ret i64 %hw64
}
; 'y_hi' can have more than 2 uses.
; Here y_hi has an additional user (llvm.fake.use) besides the two partial
; products. The expected output still contains the widened i128 multiply and
; keeps the 'lshr' only to feed the extra user.
define i64 @umulh__mul_use__y_hi(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__y_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI]])
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
call void (...) @llvm.fake.use(i64 %y_hi) ; extra use outside the multiply pattern
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi ; sum < addend: the add wrapped
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32 ; carry out of the low accumulation
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi ; high 64 bits of x * y
ret i64 %hw64
}
; 'y_lo * x_hi' must have no more than 2 uses.
define i64 @umulh__mul_use__y_lo_x_hi(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_HI]])
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Negative test: %y_lo_x_hi feeds %cross_sum and %carry_out plus the extra
; fake.use below. The expected output above keeps the whole sequence as
; written; no i128 multiply is formed.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
; Extra (third) use of %y_lo_x_hi that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i64 %y_lo_x_hi)
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'y_hi * x_hi' must have single use.
define i64 @umulh__mul_use__y_hi_x_hi(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_HI]])
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Negative test: %y_hi_x_hi feeds %intermediate plus the extra fake.use
; below. The expected output above keeps the whole sequence as written; no
; i128 multiply is formed.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
; Extra (second) use of %y_hi_x_hi that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i64 %y_hi_x_hi)
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'y_hi * x_lo' must have single use.
define i64 @umulh__mul_use__y_hi_x_lo(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_lo(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_LO]])
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Negative test: %y_hi_x_lo feeds %cross_sum plus the extra fake.use below.
; The expected output above keeps the whole sequence as written; no i128
; multiply is formed.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
; Extra (second) use of %y_hi_x_lo that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'y_lo * x_lo' has a single use when only the high part of the multiply is
; computed, and 2 uses when both the low and high parts are computed. Doing the
; optimization when only the high part is computed and there is a 2nd,
; unrelated use still results in fewer instructions and is likely profitable,
; so this seems ok.
define i64 @umulh__mul_use__y_lo_x_lo(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO]])
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[TMP5]]
;
; Positive test: %y_lo_x_lo feeds %y_lo_x_lo_hi plus the extra fake.use
; below. The expected output above keeps %x_lo, %y_lo and their product for
; that extra use and replaces the rest with a single i128 multiply + shift.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Extra (second) use of %y_lo_x_lo that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i64 %y_lo_x_lo)
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'cross_sum' must have no more than 3 uses.
define i64 @umulh__mul_use__cross_sum(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM]])
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Negative test: %cross_sum feeds %carry_out, %cross_sum_lo and %cross_sum_hi
; plus the extra fake.use below. The expected output above keeps the whole
; sequence as written; no i128 multiply is formed.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Extra (fourth) use of %cross_sum that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i64 %cross_sum)
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'carry_out' must have single use.
define i64 @umulh__mul_use__carry_out(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__carry_out(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i1 [[CARRY_OUT]])
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Negative test: %carry_out feeds the %carry select plus the extra fake.use
; below. The expected output above keeps the whole sequence as written; no
; i128 multiply is formed.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
; Extra (second) use of %carry_out that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i1 %carry_out)
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'carry' must have single use.
define i64 @umulh__mul_use__carry(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__carry(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CARRY]])
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Negative test: %carry feeds %intermediate_plus_carry plus the extra
; fake.use below. The expected output above keeps the whole sequence as
; written; no i128 multiply is formed.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; Extra (second) use of %carry that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i64 %carry)
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'y_lo_x_lo_hi' must have single use.
define i64 @umulh__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO_HI]])
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Negative test: %y_lo_x_lo_hi feeds %low_accum plus the extra fake.use
; below. The expected output above keeps the whole sequence as written; no
; i128 multiply is formed.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Extra (second) use of %y_lo_x_lo_hi that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi)
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'cross_sum_lo' must have single use.
define i64 @umulh__mul_use__cross_sum_lo(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_lo(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM_LO]])
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Negative test: %cross_sum_lo feeds %low_accum plus the extra fake.use
; below. The expected output above keeps the whole sequence as written; no
; i128 multiply is formed.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
; Extra (second) use of %cross_sum_lo that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i64 %cross_sum_lo)
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'cross_sum_hi' must have single use.
define i64 @umulh__mul_use__cross_sum_hi(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM_HI]])
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Negative test: %cross_sum_hi feeds %intermediate plus the extra fake.use
; below. The expected output above keeps the whole sequence as written; no
; i128 multiply is formed.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
; Extra (second) use of %cross_sum_hi that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i64 %cross_sum_hi)
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'low_accum' has a single use if only doing high part of multiply and 2 uses
; when doing both low/high parts. Unrelated use here, but still seems
; profitable.
define i64 @umulh__mul_use__low_accum(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__low_accum(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul i64 [[Y]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul i64 [[Y_HI]], [[X]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM]])
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[TMP5]]
;
; Positive test: %low_accum feeds %low_accum_hi plus the extra fake.use
; below. The expected output above keeps the instructions that compute
; %low_accum for that extra use (with the two cross multiplies rewritten to
; take %y and %x directly) and computes the returned value with a single
; i128 multiply + shift.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Extra (second) use of %low_accum that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i64 %low_accum)
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'intermediate' must have single use.
define i64 @umulh__mul_use__intermediate(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__intermediate(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[INTERMEDIATE]])
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Negative test: %intermediate feeds %intermediate_plus_carry plus the extra
; fake.use below. The expected output above keeps the whole sequence as
; written; no i128 multiply is formed.
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
; Extra (second) use of %intermediate that is not part of the multiply pattern.
call void (...) @llvm.fake.use(i64 %intermediate)
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'low_accum_hi' must have single use.
; Negative test: %low_accum_hi gets a second user through llvm.fake.use, and
; the expected output below keeps the whole 32-bit-limb sequence unchanged
; instead of forming the zext/mul i128/lshr/trunc sequence.
define i64 @umulh__mul_use__low_accum_hi(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__low_accum_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_HI]])
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
; Extra user of %low_accum_hi, in addition to %hw64.
call void (...) @llvm.fake.use(i64 %low_accum_hi)
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
ret i64 %hw64
}
; 'intermediate_plus_carry' must have single use.
; Negative test: %intermediate_plus_carry gets a second user through
; llvm.fake.use, and the expected output below keeps the whole 32-bit-limb
; sequence unchanged instead of forming the zext/mul i128/lshr/trunc sequence.
define i64 @umulh__mul_use__intermediate_plus_carry(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__intermediate_plus_carry(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[INTERMEDIATE_PLUS_CARRY]])
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low and high 32 bits
%x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
%y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
%x_hi = lshr i64 %x, 32 ; x >> 32
%y_hi = lshr i64 %y, 32 ; y >> 32
; Cross products
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
; Add cross terms
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
; Carry if overflowed
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
; High 32 bits of low product
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Low and high 32 bits of cross_sum
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Final result accumulation
%intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
%low_accum_hi = lshr i64 %low_accum, 32
%intermediate_plus_carry = add i64 %intermediate, %carry
%hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
; Extra user of %intermediate_plus_carry, in addition to %hw64. Here the fake
; use comes after %hw64 rather than directly after the value it uses.
call void (...) @llvm.fake.use(i64 %intermediate_plus_carry)
ret i64 %hw64
}
; 'x_lo' can have multiple uses.
; Positive test: the extra llvm.fake.use of %x_lo does not prevent the fold.
; In the expected output the high word comes from a zext/mul i128/lshr/trunc
; sequence and the low word from a plain 'mul i64', with only %x_lo kept for
; its remaining user.
define void @full_mul_int128__mul_use__x_lo(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__x_lo(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_LO]])
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split %x and %y into 32-bit halves.
%x_lo = and i64 %x, 4294967295
; Extra user of %x_lo.
call void (...) @llvm.fake.use(i64 %x_lo)
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; Four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross products; an unsigned wrap becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
; Add the low half of cross_sum to the high half of y_lo * x_lo.
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at byte offset 8 of %p.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Low 64 bits: (low_accum << 32) | (y_lo_x_lo & 0xffffffff), stored at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'y_lo' can have multiple uses.
; Positive test: the extra llvm.fake.use of %y_lo does not prevent the fold.
; In the expected output the high word comes from a zext/mul i128/lshr/trunc
; sequence and the low word from a plain 'mul i64', with only %y_lo kept for
; its remaining user.
define void @full_mul_int128__mul_use__y_lo(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO]])
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split %x and %y into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
; Extra user of %y_lo.
call void (...) @llvm.fake.use(i64 %y_lo)
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; Four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross products; an unsigned wrap becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
; Add the low half of cross_sum to the high half of y_lo * x_lo.
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at byte offset 8 of %p.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Low 64 bits: (low_accum << 32) | (y_lo_x_lo & 0xffffffff), stored at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'x_hi' can have multiple uses.
; Positive test: the extra llvm.fake.use of %x_hi does not prevent the fold.
; In the expected output the high word comes from a zext/mul i128/lshr/trunc
; sequence and the low word from a plain 'mul i64', with only %x_hi kept for
; its remaining user.
define void @full_mul_int128__mul_use__x_hi(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__x_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_HI]])
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split %x and %y into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
; Extra user of %x_hi.
call void (...) @llvm.fake.use(i64 %x_hi)
%y_hi = lshr i64 %y, 32
; Four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross products; an unsigned wrap becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
; Add the low half of cross_sum to the high half of y_lo * x_lo.
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at byte offset 8 of %p.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Low 64 bits: (low_accum << 32) | (y_lo_x_lo & 0xffffffff), stored at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'y_hi' can have multiple uses.
; Positive test: the extra llvm.fake.use of %y_hi does not prevent the fold.
; In the expected output the high word comes from a zext/mul i128/lshr/trunc
; sequence and the low word from a plain 'mul i64', with only %y_hi kept for
; its remaining user.
define void @full_mul_int128__mul_use__y_hi(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI]])
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split %x and %y into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; Extra user of %y_hi.
call void (...) @llvm.fake.use(i64 %y_hi)
; Four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross products; an unsigned wrap becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
; Add the low half of cross_sum to the high half of y_lo * x_lo.
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at byte offset 8 of %p.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Low 64 bits: (low_accum << 32) | (y_lo_x_lo & 0xffffffff), stored at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'y_lo_x_hi' must have exactly 2 uses.
; Negative test: llvm.fake.use adds a third user of %y_lo_x_hi (the others are
; %cross_sum and %carry_out). The expected output keeps the limb arithmetic
; for both the high and the low word.
define void @full_mul_int128__mul_use__y_lo_x_hi(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_HI]])
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split %x and %y into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; Four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
; Extra user of %y_lo_x_hi.
call void (...) @llvm.fake.use(i64 %y_lo_x_hi)
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross products; an unsigned wrap becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
; Add the low half of cross_sum to the high half of y_lo * x_lo.
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at byte offset 8 of %p.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Low 64 bits: (low_accum << 32) | (y_lo_x_lo & 0xffffffff), stored at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'y_hi_x_hi' must have single use.
; Negative test: llvm.fake.use adds a second user of %y_hi_x_hi (the other is
; %upper_mid). The expected output keeps the limb arithmetic for both the high
; and the low word.
define void @full_mul_int128__mul_use__y_hi_x_hi(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_HI]])
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split %x and %y into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; Four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
; Extra user of %y_hi_x_hi.
call void (...) @llvm.fake.use(i64 %y_hi_x_hi)
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross products; an unsigned wrap becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
; Add the low half of cross_sum to the high half of y_lo * x_lo.
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at byte offset 8 of %p.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Low 64 bits: (low_accum << 32) | (y_lo_x_lo & 0xffffffff), stored at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'y_hi_x_lo' must have single use.
; Negative test: llvm.fake.use adds a second user of %y_hi_x_lo (the other is
; %cross_sum). The expected output keeps the limb arithmetic for both the high
; and the low word.
define void @full_mul_int128__mul_use__y_hi_x_lo(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_lo(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_LO]])
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split %x and %y into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; Four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
; Extra user of %y_hi_x_lo.
call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross products; an unsigned wrap becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
; Add the low half of cross_sum to the high half of y_lo * x_lo.
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at byte offset 8 of %p.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Low 64 bits: (low_accum << 32) | (y_lo_x_lo & 0xffffffff), stored at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'y_lo_x_lo' can have multiple uses.
; The extra llvm.fake.use of %y_lo_x_lo does not prevent the high-word fold:
; the expected output computes it with a zext/mul i128/lshr/trunc sequence.
; TODO: The low word does not simplify like it should? In the expected output
; it stays an 'add' of %y_lo_x_lo and a shifted sum of two multiplies rather
; than a single 'mul i64 %x, %y' (presumably what is meant; confirm).
define void @full_mul_int128__mul_use__y_lo_x_lo(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = mul i64 [[Y]], [[X_HI]]
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = mul i64 [[Y_HI]], [[X]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO]])
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM1:%.*]] = shl i64 [[TMP6]], 32
; CHECK-NEXT: [[LW64:%.*]] = add i64 [[Y_LO_X_LO]], [[LOW_ACCUM1]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split %x and %y into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; Four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Extra user of %y_lo_x_lo.
call void (...) @llvm.fake.use(i64 %y_lo_x_lo)
; Sum the cross products; an unsigned wrap becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
; Add the low half of cross_sum to the high half of y_lo * x_lo.
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at byte offset 8 of %p.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Low 64 bits: (low_accum << 32) | (y_lo_x_lo & 0xffffffff), stored at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'cross_sum' must have no more than 3 uses.
; Negative test: llvm.fake.use adds a fourth user of %cross_sum (the others
; are %carry_out, %cross_sum_lo and %cross_sum_hi). The expected output keeps
; the limb arithmetic for both the high and the low word.
define void @full_mul_int128__mul_use__cross_sum(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM]])
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split %x and %y into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; Four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross products; an unsigned wrap becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
; Extra user of %cross_sum.
call void (...) @llvm.fake.use(i64 %cross_sum)
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
; Add the low half of cross_sum to the high half of y_lo * x_lo.
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at byte offset 8 of %p.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Low 64 bits: (low_accum << 32) | (y_lo_x_lo & 0xffffffff), stored at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'carry_out' must have single use.
; Negative test: llvm.fake.use adds a second user of the i1 %carry_out (the
; other is the %carry select). The expected output keeps the limb arithmetic
; for both the high and the low word.
define void @full_mul_int128__mul_use__carry_out(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__carry_out(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i1 [[CARRY_OUT]])
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split %x and %y into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; Four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross products; an unsigned wrap becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
; Extra user of %carry_out.
call void (...) @llvm.fake.use(i1 %carry_out)
%carry = select i1 %carry_out, i64 4294967296, i64 0
; Add the low half of cross_sum to the high half of y_lo * x_lo.
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at byte offset 8 of %p.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Low 64 bits: (low_accum << 32) | (y_lo_x_lo & 0xffffffff), stored at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'carry' must have single use.
; Negative test: @llvm.fake.use gives %carry a second user besides the add that
; forms %upper_mid. The CHECK lines expect the hand-written 32-bit limb
; sequence to be left in place, with no widened i128 multiply.
define void @full_mul_int128__mul_use__carry(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__carry(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CARRY]])
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split both operands into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; The four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross terms; a wrapped sum becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
; Extra use under test: second user of %carry.
call void (...) @llvm.fake.use(i64 %carry)
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at %p + 8.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Assemble the low 64 bits and store them at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'y_lo_x_lo_hi' must have single use.
; Negative test: @llvm.fake.use gives %y_lo_x_lo_hi a second user besides the
; add that forms %low_accum. The CHECK lines expect the hand-written 32-bit
; limb sequence to be left in place, with no widened i128 multiply.
define void @full_mul_int128__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO_HI]])
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split both operands into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; The four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross terms; a wrapped sum becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
; Extra use under test: second user of %y_lo_x_lo_hi.
call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi)
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at %p + 8.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Assemble the low 64 bits and store them at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'cross_sum_lo' must have single use.
; Negative test: @llvm.fake.use gives %cross_sum_lo a second user besides the
; add that forms %low_accum. The CHECK lines expect the hand-written 32-bit
; limb sequence to be left in place, with no widened i128 multiply.
define void @full_mul_int128__mul_use__cross_sum_lo(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_lo(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM_LO]])
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split both operands into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; The four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross terms; a wrapped sum becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
; Extra use under test: second user of %cross_sum_lo.
call void (...) @llvm.fake.use(i64 %cross_sum_lo)
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at %p + 8.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Assemble the low 64 bits and store them at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'cross_sum_hi' must have single use.
; Negative test: @llvm.fake.use gives %cross_sum_hi a second user besides the
; add that forms %upper_mid_with_cross. The CHECK lines expect the hand-written
; 32-bit limb sequence to be left in place, with no widened i128 multiply.
define void @full_mul_int128__mul_use__cross_sum_hi(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM_HI]])
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split both operands into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; The four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross terms; a wrapped sum becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
; Extra use under test: second user of %cross_sum_hi.
call void (...) @llvm.fake.use(i64 %cross_sum_hi)
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at %p + 8.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Assemble the low 64 bits and store them at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'low_accum' must have exactly 2 uses if doing high multiply.
; Negative test: %low_accum already feeds the lshr (%low_accum_hi) and the shl
; (%low_accum_shifted); @llvm.fake.use adds a third user. The CHECK lines expect
; the hand-written 32-bit limb sequence to be left in place, with no widened
; i128 multiply.
define void @full_mul_int128__mul_use__low_accum(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM]])
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split both operands into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; The four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross terms; a wrapped sum becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Extra use under test: third user of %low_accum.
call void (...) @llvm.fake.use(i64 %low_accum)
; Accumulate the high 64 bits and store them at %p + 8.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Assemble the low 64 bits and store them at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'upper_mid' must have single use.
; Negative test: @llvm.fake.use gives %upper_mid a second user besides the add
; that forms %upper_mid_with_cross. The CHECK lines expect the hand-written
; 32-bit limb sequence to be left in place, with no widened i128 multiply.
define void @full_mul_int128__mul_use__upper_mid(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[UPPER_MID]])
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[TMP9:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[TMP9]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split both operands into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; The four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross terms; a wrapped sum becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at %p + 8.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
; Extra use under test: second user of %upper_mid.
call void (...) @llvm.fake.use(i64 %upper_mid)
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Assemble the low 64 bits and store them at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'low_accum_hi' must have single use.
; Negative test: @llvm.fake.use gives %low_accum_hi a second user besides the
; add that forms %hw64. The CHECK lines expect the hand-written 32-bit limb
; sequence to be left in place, with no widened i128 multiply.
define void @full_mul_int128__mul_use__low_accum_hi(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_HI]])
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split both operands into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; The four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross terms; a wrapped sum becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at %p + 8.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
; Extra use under test: second user of %low_accum_hi.
call void (...) @llvm.fake.use(i64 %low_accum_hi)
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Assemble the low 64 bits and store them at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'upper_mid_with_cross' must have single use.
; Negative test: @llvm.fake.use gives %upper_mid_with_cross a second user
; besides the add that forms %hw64. The CHECK lines expect the hand-written
; 32-bit limb sequence to be left in place, with no widened i128 multiply.
define void @full_mul_int128__mul_use__upper_mid_with_cross(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid_with_cross(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[UPPER_MID_WITH_CROSS]])
; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split both operands into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; The four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross terms; a wrapped sum becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at %p + 8.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
; Extra use under test: second user of %upper_mid_with_cross, the value this
; test is named for.
call void (...) @llvm.fake.use(i64 %upper_mid_with_cross)
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Assemble the low 64 bits and store them at %p.
%low_accum_shifted = shl i64 %low_accum, 32
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}
; 'low_accum_shifted' can have multiple uses.
; Positive test: @llvm.fake.use gives %low_accum_shifted a second user besides
; the 'or disjoint' that forms %lw64. The CHECK lines expect the high word to
; become zext/mul nuw i128/lshr 64/trunc and the low word a plain
; 'mul i64 %x, %y'; the fake use then sees that product masked with
; -4294967296 (upper 32 bits kept).
define void @full_mul_int128__mul_use__low_accum_shifted(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_shifted(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8
; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]]
; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = and i64 [[LW64]], -4294967296
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_SHIFTED]])
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
; Split both operands into 32-bit halves.
%x_lo = and i64 %x, 4294967295
%y_lo = and i64 %y, 4294967295
%x_hi = lshr i64 %x, 32
%y_hi = lshr i64 %y, 32
; The four partial products of the halves.
%y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
%y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
%y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
%y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
; Sum the cross terms; a wrapped sum becomes a carry of 1 << 32.
%cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
%carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
%carry = select i1 %carry_out, i64 4294967296, i64 0
%y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
%cross_sum_lo = and i64 %cross_sum, 4294967295
%cross_sum_hi = lshr i64 %cross_sum, 32
%low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
; Accumulate the high 64 bits and store them at %p + 8.
%upper_mid = add nuw i64 %y_hi_x_hi, %carry
%low_accum_hi = lshr i64 %low_accum, 32
%upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
%hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
%hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
store i64 %hw64, ptr %hi_ptr, align 8
; Assemble the low 64 bits and store them at %p.
%low_accum_shifted = shl i64 %low_accum, 32
; Extra use under test: second user of %low_accum_shifted.
call void (...) @llvm.fake.use(i64 %low_accum_shifted)
%y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
%lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
store i64 %lw64, ptr %p, align 8
ret void
}