; blob: fbf6d90e624bcb53a00be3efe159c41542449e5c [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; Scalar i16 multiply in an amdgpu_ps function with `inreg` operands; the
; expected output below keeps the operands in s0/s1 and the product in s0.
; Per the expected output, gfx7 emits a bare s_mul_i32, while gfx8, gfx9 and
; gfx10 first AND both operands with 0xffff (materialised in s2) and then use
; s_mul_i32.
; NOTE(review): the %num/%den names look inherited from a division test; here
; they are simply the two factors.
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX7-LABEL: s_mul_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
; Vector i16 multiply in a default-calling-convention function; the expected
; output keeps the operands in v0/v1, the product in v0, and returns through
; s_setpc_b64 s[30:31].
; Per the expected output, gfx7 masks both operands with 0xffff (held in s4)
; and multiplies with v_mul_u32_u24; gfx8 and gfx9 use a single
; v_mul_lo_u16_e32; gfx10 uses v_mul_lo_u16 after an additional
; s_waitcnt_vscnt at function entry.
define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX7-LABEL: v_mul_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
; Scalar i16 multiply where both operands and the return value carry the
; `zeroext` attribute.
; Per the expected output, gfx7 multiplies the inputs directly with s_mul_i32
; and then ANDs the product with 0xffff. gfx8, gfx9 and gfx10 still mask both
; inputs with 0xffff before s_mul_i32 and mask the product again afterwards.
define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
; GFX7-LABEL: s_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16_zeroext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
; Vector i16 multiply where both operands and the return value carry the
; `zeroext` attribute.
; Per the expected output, gfx7 feeds the inputs straight into
; v_mul_u32_u24 and ANDs the product with 0xffff; gfx8 and gfx9 emit only
; v_mul_lo_u16_e32 with no explicit re-extension; gfx10 follows v_mul_lo_u16
; with v_bfe_u32 v0, v0, 0, 16.
; NOTE(review): presumably the gfx10-only v_bfe_u32 exists because 16-bit
; VALU results are not guaranteed zero in the high half there - confirm
; against the ISA documentation.
define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX7-LABEL: v_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16_zeroext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
; Scalar i16 multiply where both operands and the return value carry the
; `signext` attribute.
; Per the expected output, gfx7 multiplies the inputs directly with s_mul_i32
; and sign-extends the low 16 bits of the product with s_sext_i32_i16. gfx8,
; gfx9 and gfx10 additionally mask both inputs with 0xffff before the
; multiply and then apply the same s_sext_i32_i16.
define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
; GFX7-LABEL: s_mul_i16_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: s_sext_i32_i16 s0, s0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_sext_i32_i16 s0, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_sext_i32_i16 s0, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16_signext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_sext_i32_i16 s0, s0
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
; Vector i16 multiply where both operands and the return value carry the
; `signext` attribute.
; Per the expected output, every target ends with v_bfe_i32 v0, v0, 0, 16 to
; sign-extend the 16-bit product. gfx7 first masks both inputs with 0xffff
; (held in s4) and multiplies with v_mul_u32_u24; gfx8 and gfx9 multiply with
; v_mul_lo_u16_e32; gfx10 multiplies with v_mul_lo_u16.
define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX7-LABEL: v_mul_i16_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16_signext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
; Scalar i32 multiply with `inreg` operands. The expected output is the same
; single s_mul_i32 s0, s0, s1 on every target; the first group of checks is
; shared by the gfx7/gfx8/gfx9 runs and the second belongs to the gfx10 run.
define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GCN-LABEL: s_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
%result = mul i32 %num, %den
ret i32 %result
}
; Vector i32 multiply. The expected output is a single
; v_mul_lo_u32 v0, v0, v1 on every target; the gfx10 sequence differs only by
; the extra s_waitcnt_vscnt at function entry.
define i32 @v_mul_i32(i32 %num, i32 %den) {
; GCN-LABEL: v_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i32 %num, %den
ret i32 %result
}
; Scalar <2 x i32> multiply with `inreg` operands. The expected output is two
; independent s_mul_i32 instructions on every target: element 0 is s0 * s2
; (written to s0) and element 1 is s1 * s3 (written to s1).
define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
; GCN-LABEL: s_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s2
; GCN-NEXT: s_mul_i32 s1, s1, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: s_mul_i32 s1, s1, s3
; GFX10-NEXT: ; return to shader part epilog
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
; Vector <2 x i32> multiply. The expected output is two independent
; v_mul_lo_u32 instructions on every target: element 0 is v0 * v2 (written to
; v0) and element 1 is v1 * v3 (written to v1). The gfx10 sequence differs
; only by the extra s_waitcnt_vscnt at function entry.
define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GCN-LABEL: v_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v2
; GCN-NEXT: v_mul_lo_u32 v1, v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
; Scalar i64 multiply with `inreg` operands, expanded into 32-bit pieces. In
; the expected output the first operand is in s0 (low) / s1 (high) and the
; second in s2 (low) / s3 (high):
;   low dword  = s0 * s2
;   high dword = s1 * s2 + s0 * s3 + mul_hi(s0, s2)
; gfx7 and gfx8 compute the mul_hi term on the VALU (v_mul_hi_u32) and copy
; the final high dword back with v_readfirstlane_b32; gfx9 and gfx10 stay on
; the scalar unit using s_mul_hi_u32.
define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX7-LABEL: s_mul_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s4, s0, s2
; GFX7-NEXT: s_mul_i32 s1, s1, s2
; GFX7-NEXT: s_mul_i32 s0, s0, s3
; GFX7-NEXT: s_add_i32 s1, s1, s0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s4, s0, s2
; GFX8-NEXT: s_mul_i32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s3
; GFX8-NEXT: s_add_i32 s1, s1, s0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s3, s0, s3
; GFX9-NEXT: s_mul_i32 s4, s0, s2
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s2
; GFX9-NEXT: s_add_i32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s1, s1, s0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s3, s0, s3
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s3
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: ; return to shader part epilog
%result = mul i64 %num, %den
ret i64 %result
}
; Vector i64 multiply, expanded into 32-bit pieces. In the expected output
; the first operand is in v0 (low) / v1 (high) and the second in v2 (low) /
; v3 (high):
;   low dword  = v0 * v2
;   high dword = v1 * v2 + v0 * v3 + mul_hi(v0, v2)
; gfx7 and gfx8 sum the high dword with two carry-out adds and need a final
; v_mov_b32 to place the low dword in v0; gfx9 and gfx10 fold the sum into a
; single v_add3_u32 and write the low dword to v0 directly.
define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX7-LABEL: v_mul_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v4, v0, v3
; GFX7-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX7-NEXT: v_mul_lo_u32 v3, v0, v2
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v4, v0, v3
; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX8-NEXT: v_mul_lo_u32 v3, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX9-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v1, v1, v3, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v2
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX10-NEXT: v_add3_u32 v1, v1, v3, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i64 %num, %den
ret i64 %result
}
; Scalar i96 multiply with `inreg` operands; the product is bitcast to
; <3 x i32> for the return, and the expected output leaves the three result
; dwords in s0, s1 and s2.
; In the expected output the first operand occupies s0-s2 and the second
; s3-s5 (low dword first). The expansion is a truncated schoolbook multiply:
;   dword 0 = s0 * s3
;   dword 1 = s1 * s3 + s0 * s4 + mul_hi(s0, s3)
;   dword 2 = s2 * s3 + s1 * s4 + s0 * s5 + mul_hi(s1, s3) + mul_hi(s0, s4)
;             + the carries out of the dword 1 additions
; Scalar carries are materialised with s_cselect_b32 ..., 1, 0. gfx7 and gfx8
; compute the mul_hi terms on the VALU (v_mul_hi_u32, carries through vcc and
; v_cndmask_b32) and copy dwords 1 and 2 back with v_readfirstlane_b32; gfx9
; and gfx10 stay entirely on the scalar unit using s_mul_hi_u32.
define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX7-LABEL: s_mul_i96:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_mul_i32 s7, s1, s3
; GFX7-NEXT: s_mul_i32 s8, s0, s4
; GFX7-NEXT: s_add_u32 s7, s7, s8
; GFX7-NEXT: v_mov_b32_e32 v3, s4
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s3
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s7, v0
; GFX7-NEXT: s_mul_i32 s7, s1, s4
; GFX7-NEXT: s_mul_i32 s2, s2, s3
; GFX7-NEXT: v_mul_hi_u32 v3, s0, v3
; GFX7-NEXT: s_cselect_b32 s8, 1, 0
; GFX7-NEXT: s_mul_i32 s6, s0, s3
; GFX7-NEXT: s_mul_i32 s5, s0, s5
; GFX7-NEXT: s_add_i32 s0, s2, s7
; GFX7-NEXT: s_add_i32 s0, s0, s5
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; GFX7-NEXT: s_and_b32 s8, s8, 1
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s8, v1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
; GFX7-NEXT: s_mov_b32 s0, s6
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i96:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_mul_i32 s7, s1, s3
; GFX8-NEXT: s_mul_i32 s8, s0, s4
; GFX8-NEXT: s_add_u32 s7, s7, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s7, v0
; GFX8-NEXT: s_mul_i32 s7, s1, s4
; GFX8-NEXT: s_mul_i32 s2, s2, s3
; GFX8-NEXT: v_mul_hi_u32 v3, s0, v3
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
; GFX8-NEXT: s_mul_i32 s6, s0, s3
; GFX8-NEXT: s_mul_i32 s5, s0, s5
; GFX8-NEXT: s_add_i32 s0, s2, s7
; GFX8-NEXT: s_add_i32 s0, s0, s5
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT: s_and_b32 s8, s8, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s8, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: s_mov_b32 s0, s6
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i96:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s7, s1, s3
; GFX9-NEXT: s_mul_i32 s8, s0, s4
; GFX9-NEXT: s_add_u32 s7, s7, s8
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s9, s0, s3
; GFX9-NEXT: s_and_b32 s8, s8, 1
; GFX9-NEXT: s_add_u32 s7, s7, s9
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
; GFX9-NEXT: s_and_b32 s9, s9, 1
; GFX9-NEXT: s_add_i32 s8, s8, s9
; GFX9-NEXT: s_mul_i32 s9, s1, s4
; GFX9-NEXT: s_mul_i32 s2, s2, s3
; GFX9-NEXT: s_mul_i32 s5, s0, s5
; GFX9-NEXT: s_add_i32 s2, s2, s9
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s2, s2, s5
; GFX9-NEXT: s_mul_i32 s6, s0, s3
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s4
; GFX9-NEXT: s_add_i32 s1, s2, s1
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_add_i32 s2, s0, s8
; GFX9-NEXT: s_mov_b32 s0, s6
; GFX9-NEXT: s_mov_b32 s1, s7
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i96:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s6, s1, s3
; GFX10-NEXT: s_mul_i32 s7, s0, s4
; GFX10-NEXT: s_mul_hi_u32 s8, s0, s3
; GFX10-NEXT: s_add_u32 s6, s6, s7
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_mul_i32 s9, s1, s4
; GFX10-NEXT: s_and_b32 s7, s7, 1
; GFX10-NEXT: s_mul_i32 s2, s2, s3
; GFX10-NEXT: s_add_u32 s6, s6, s8
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_mul_i32 s5, s0, s5
; GFX10-NEXT: s_add_i32 s2, s2, s9
; GFX10-NEXT: s_mul_hi_u32 s1, s1, s3
; GFX10-NEXT: s_add_i32 s2, s2, s5
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s4
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: s_add_i32 s7, s7, s8
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: s_mul_i32 s0, s0, s3
; GFX10-NEXT: s_add_i32 s2, s1, s7
; GFX10-NEXT: s_mov_b32 s1, s6
; GFX10-NEXT: ; return to shader part epilog
%result = mul i96 %num, %den
; Re-type the 96-bit product as three dwords for the return value.
%cast = bitcast i96 %result to <3 x i32>
ret <3 x i32> %cast
}
; Vector i96 multiply returned directly as i96; the expected output leaves
; the three result dwords in v0, v1 and v2.
; In the expected output the first operand occupies v0-v2 and the second
; v3-v5 (low dword first). The expansion is a truncated schoolbook multiply:
;   dword 0 = v0 * v3
;   dword 1 = v1 * v3 + v0 * v4 + mul_hi(v0, v3)
;   dword 2 = v2 * v3 + v1 * v4 + v0 * v5 + mul_hi(v1, v3) + mul_hi(v0, v4)
;             + the carries out of the dword 1 additions
; Carries are turned into 0/1 values with v_cndmask_b32_e64 ..., 0, 1, reading
; vcc on gfx7/gfx8/gfx9 and s4 on gfx10. gfx9 and gfx10 fold the dword 2 sum
; with v_add3_u32; gfx7 and gfx8 use a chain of two-operand adds.
define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX7-LABEL: v_mul_i96:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX7-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX7-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX7-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX7-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX7-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX7-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v9
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v0, v8
; GFX7-NEXT: v_mov_b32_e32 v0, v6
; GFX7-NEXT: v_mov_b32_e32 v1, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i96:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX8-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX8-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX8-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX8-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX8-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v8
; GFX8-NEXT: v_mov_b32_e32 v0, v6
; GFX8-NEXT: v_mov_b32_e32 v1, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i96:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX9-NEXT: v_mul_lo_u32 v10, v1, v4
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX9-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX9-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v2, v2, v10
; GFX9-NEXT: v_add_u32_e32 v3, v8, v9
; GFX9-NEXT: v_add3_u32 v1, v2, v5, v1
; GFX9-NEXT: v_add3_u32 v2, v1, v0, v3
; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: v_mov_b32_e32 v1, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v6, v1, v3
; GFX10-NEXT: v_mul_lo_u32 v7, v0, v4
; GFX10-NEXT: v_mul_hi_u32 v8, v0, v3
; GFX10-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX10-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3
; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v7
; GFX10-NEXT: v_mul_hi_u32 v7, v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v9
; GFX10-NEXT: v_add_co_u32 v1, s4, v6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v2, v2, v5, v7
; GFX10-NEXT: v_add_nc_u32_e32 v3, v11, v6
; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
}
; Scalar i128 multiply with `inreg` operands; the product is bitcast to
; <4 x i32> for the return, and the expected output leaves the four result
; dwords in s0, s1, s2 and s3.
; In the expected output the first operand occupies s0-s3 and the second
; s4-s7 (low dword first). The expansion is a truncated schoolbook multiply:
;   dword 0 = s0 * s4
;   dword 1 = s1 * s4 + s0 * s5 + mul_hi(s0, s4)
;   dword 2 = s2 * s4 + s1 * s5 + s0 * s6 + mul_hi(s1, s4) + mul_hi(s0, s5)
;             + the carries out of dword 1
;   dword 3 = s3 * s4 + s2 * s5 + s1 * s6 + s0 * s7 + mul_hi(s2, s4)
;             + mul_hi(s1, s5) + mul_hi(s0, s6) + the carries out of dword 2
; Scalar carries are materialised with s_cselect_b32 ..., 1, 0. gfx7 and gfx8
; compute the mul_hi terms on the VALU (v_mul_hi_u32, carries through vcc and
; v_cndmask_b32) and copy dwords 1-3 back with v_readfirstlane_b32; gfx9 and
; gfx10 stay entirely on the scalar unit using s_mul_hi_u32.
define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX7-LABEL: s_mul_i128:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s9, s1, s4
; GFX7-NEXT: s_mul_i32 s10, s0, s5
; GFX7-NEXT: s_add_u32 s9, s9, s10
; GFX7-NEXT: s_cselect_b32 s10, 1, 0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s9, v0
; GFX7-NEXT: s_and_b32 s10, s10, 1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s10, v1
; GFX7-NEXT: s_mul_i32 s9, s2, s4
; GFX7-NEXT: s_mul_i32 s10, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_add_u32 s9, s9, s10
; GFX7-NEXT: s_cselect_b32 s10, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX7-NEXT: s_mul_i32 s11, s0, s6
; GFX7-NEXT: s_and_b32 s10, s10, 1
; GFX7-NEXT: s_add_u32 s9, s9, s11
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: s_cselect_b32 s11, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s9, v2
; GFX7-NEXT: s_and_b32 s11, s11, 1
; GFX7-NEXT: s_add_i32 s10, s10, s11
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s10, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: s_mul_i32 s5, s2, s5
; GFX7-NEXT: s_mul_i32 s3, s3, s4
; GFX7-NEXT: v_mul_hi_u32 v4, v4, s4
; GFX7-NEXT: s_mul_i32 s8, s0, s4
; GFX7-NEXT: s_mul_i32 s9, s1, s6
; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX7-NEXT: s_mul_i32 s7, s0, s7
; GFX7-NEXT: v_mul_hi_u32 v5, s0, v5
; GFX7-NEXT: s_add_i32 s0, s3, s5
; GFX7-NEXT: s_add_i32 s0, s0, s9
; GFX7-NEXT: s_add_i32 s0, s0, s7
; GFX7-NEXT: v_add_i32_e32 v4, vcc, s0, v4
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
; GFX7-NEXT: v_readfirstlane_b32 s3, v2
; GFX7-NEXT: s_mov_b32 s0, s8
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s9, s1, s4
; GFX8-NEXT: s_mul_i32 s10, s0, s5
; GFX8-NEXT: s_add_u32 s9, s9, s10
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s9, v0
; GFX8-NEXT: s_and_b32 s10, s10, 1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s10, v1
; GFX8-NEXT: s_mul_i32 s9, s2, s4
; GFX8-NEXT: s_mul_i32 s10, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_add_u32 s9, s9, s10
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX8-NEXT: s_mul_i32 s11, s0, s6
; GFX8-NEXT: s_and_b32 s10, s10, 1
; GFX8-NEXT: s_add_u32 s9, s9, s11
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_cselect_b32 s11, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s9, v2
; GFX8-NEXT: s_and_b32 s11, s11, 1
; GFX8-NEXT: s_add_i32 s10, s10, s11
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s10, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_mul_i32 s5, s2, s5
; GFX8-NEXT: s_mul_i32 s3, s3, s4
; GFX8-NEXT: v_mul_hi_u32 v4, v4, s4
; GFX8-NEXT: s_mul_i32 s8, s0, s4
; GFX8-NEXT: s_mul_i32 s9, s1, s6
; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX8-NEXT: s_mul_i32 s7, s0, s7
; GFX8-NEXT: v_mul_hi_u32 v5, s0, v5
; GFX8-NEXT: s_add_i32 s0, s3, s5
; GFX8-NEXT: s_add_i32 s0, s0, s9
; GFX8-NEXT: s_add_i32 s0, s0, s7
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_readfirstlane_b32 s3, v2
; GFX8-NEXT: s_mov_b32 s0, s8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s9, s1, s4
; GFX9-NEXT: s_mul_i32 s10, s0, s5
; GFX9-NEXT: s_add_u32 s9, s9, s10
; GFX9-NEXT: s_cselect_b32 s10, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s11, s0, s4
; GFX9-NEXT: s_and_b32 s10, s10, 1
; GFX9-NEXT: s_add_u32 s9, s9, s11
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: s_add_i32 s10, s10, s11
; GFX9-NEXT: s_mul_i32 s11, s2, s4
; GFX9-NEXT: s_mul_i32 s12, s1, s5
; GFX9-NEXT: s_add_u32 s11, s11, s12
; GFX9-NEXT: s_cselect_b32 s12, 1, 0
; GFX9-NEXT: s_mul_i32 s13, s0, s6
; GFX9-NEXT: s_and_b32 s12, s12, 1
; GFX9-NEXT: s_add_u32 s11, s11, s13
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s11, s11, s14
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_mul_hi_u32 s15, s0, s5
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s11, s11, s15
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: s_add_i32 s12, s12, s11
; GFX9-NEXT: s_mul_i32 s11, s2, s5
; GFX9-NEXT: s_mul_i32 s3, s3, s4
; GFX9-NEXT: s_mul_i32 s13, s1, s6
; GFX9-NEXT: s_add_i32 s3, s3, s11
; GFX9-NEXT: s_mul_i32 s7, s0, s7
; GFX9-NEXT: s_add_i32 s3, s3, s13
; GFX9-NEXT: s_mul_hi_u32 s2, s2, s4
; GFX9-NEXT: s_add_i32 s3, s3, s7
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s5
; GFX9-NEXT: s_add_i32 s2, s3, s2
; GFX9-NEXT: s_mul_i32 s8, s0, s4
; GFX9-NEXT: s_add_i32 s1, s2, s1
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s6
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_add_i32 s3, s0, s12
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_mov_b32 s2, s10
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s8, s1, s4
; GFX10-NEXT: s_mul_i32 s9, s0, s5
; GFX10-NEXT: s_mul_hi_u32 s10, s0, s4
; GFX10-NEXT: s_add_u32 s8, s8, s9
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_mul_i32 s11, s1, s5
; GFX10-NEXT: s_and_b32 s9, s9, 1
; GFX10-NEXT: s_add_u32 s8, s8, s10
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_mul_i32 s12, s0, s6
; GFX10-NEXT: s_and_b32 s10, s10, 1
; GFX10-NEXT: s_mul_hi_u32 s13, s1, s4
; GFX10-NEXT: s_add_i32 s9, s9, s10
; GFX10-NEXT: s_mul_i32 s10, s2, s4
; GFX10-NEXT: s_mul_i32 s3, s3, s4
; GFX10-NEXT: s_add_u32 s10, s10, s11
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_mul_i32 s7, s0, s7
; GFX10-NEXT: s_and_b32 s11, s11, 1
; GFX10-NEXT: s_add_u32 s10, s10, s12
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_and_b32 s12, s12, 1
; GFX10-NEXT: s_add_i32 s11, s11, s12
; GFX10-NEXT: s_add_u32 s10, s10, s13
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s13, s0, s5
; GFX10-NEXT: s_and_b32 s12, s12, 1
; GFX10-NEXT: s_add_i32 s11, s11, s12
; GFX10-NEXT: s_add_u32 s10, s10, s13
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_mul_i32 s13, s1, s6
; GFX10-NEXT: s_and_b32 s12, s12, 1
; GFX10-NEXT: s_mul_hi_u32 s1, s1, s5
; GFX10-NEXT: s_add_i32 s11, s11, s12
; GFX10-NEXT: s_mul_i32 s12, s2, s5
; GFX10-NEXT: s_add_u32 s9, s10, s9
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_add_i32 s3, s3, s12
; GFX10-NEXT: s_mul_hi_u32 s2, s2, s4
; GFX10-NEXT: s_add_i32 s3, s3, s13
; GFX10-NEXT: s_and_b32 s10, s10, 1
; GFX10-NEXT: s_add_i32 s3, s3, s7
; GFX10-NEXT: s_add_i32 s11, s11, s10
; GFX10-NEXT: s_add_i32 s2, s3, s2
; GFX10-NEXT: s_mul_hi_u32 s3, s0, s6
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: s_mul_i32 s0, s0, s4
; GFX10-NEXT: s_add_i32 s1, s1, s3
; GFX10-NEXT: s_mov_b32 s2, s9
; GFX10-NEXT: s_add_i32 s3, s1, s11
; GFX10-NEXT: s_mov_b32 s1, s8
; GFX10-NEXT: ; return to shader part epilog
%result = mul i128 %num, %den
; Re-type the 128-bit product as four dwords for the return value.
%cast = bitcast i128 %result to <4 x i32>
ret <4 x i32> %cast
}
define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-LABEL: v_mul_i128:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX7-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX7-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX7-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX7-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX7-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX7-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX7-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX7-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v14
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v15
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; GFX7-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX7-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX7-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX7-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v12
; GFX7-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v13
; GFX7-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v0, v11
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX8-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX8-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX8-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX8-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v11
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX8-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX8-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX8-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v12
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v13
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v14
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v15
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v12, v11
; GFX8-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX8-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX8-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX8-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v12
; GFX8-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v13
; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v11
; GFX8-NEXT: v_mov_b32_e32 v0, v8
; GFX8-NEXT: v_mov_b32_e32 v1, v9
; GFX8-NEXT: v_mov_b32_e32 v2, v10
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX9-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX9-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v10
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v11
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v10, v10, v11
; GFX9-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX9-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX9-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v14
; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v15
; GFX9-NEXT: v_add3_u32 v12, v12, v13, v14
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v11, v12, v13, v11
; GFX9-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX9-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX9-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX9-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX9-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX9-NEXT: v_add_u32_e32 v3, v3, v12
; GFX9-NEXT: v_add3_u32 v3, v3, v13, v7
; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX9-NEXT: v_add3_u32 v3, v1, v0, v11
; GFX9-NEXT: v_mov_b32_e32 v0, v8
; GFX9-NEXT: v_mov_b32_e32 v1, v9
; GFX9-NEXT: v_mov_b32_e32 v2, v10
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v8, v2, v4
; GFX10-NEXT: v_mul_lo_u32 v9, v1, v5
; GFX10-NEXT: v_mul_lo_u32 v10, v1, v4
; GFX10-NEXT: v_mul_lo_u32 v11, v0, v5
; GFX10-NEXT: v_mul_hi_u32 v12, v0, v4
; GFX10-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX10-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX10-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX10-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX10-NEXT: v_add_co_u32 v8, s4, v8, v9
; GFX10-NEXT: v_add_co_u32 v9, s5, v10, v11
; GFX10-NEXT: v_mul_hi_u32 v11, v1, v4
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v13, s4, v8, v13
; GFX10-NEXT: v_add_co_u32 v8, s5, v9, v12
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v18, s4, v13, v11
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9
; GFX10-NEXT: v_mul_lo_u32 v10, v2, v5
; GFX10-NEXT: v_add_co_u32 v11, s4, v18, v15
; GFX10-NEXT: v_mul_hi_u32 v15, v2, v4
; GFX10-NEXT: v_add3_u32 v12, v14, v12, v13
; GFX10-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v2, s4, v11, v9
; GFX10-NEXT: v_add_nc_u32_e32 v10, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v6, v0, v6
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
; GFX10-NEXT: v_add3_u32 v10, v10, v13, v7
; GFX10-NEXT: v_add3_u32 v4, v12, v14, v5
; GFX10-NEXT: v_add3_u32 v1, v10, v15, v1
; GFX10-NEXT: v_add3_u32 v3, v1, v6, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i128 %num, %den
ret i128 %result
}
define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-LABEL: s_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s17, s1, s8
; GFX7-NEXT: s_mul_i32 s18, s0, s9
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s17, v0
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s18, v1
; GFX7-NEXT: s_mul_i32 s17, s2, s8
; GFX7-NEXT: s_mul_i32 s18, s1, s9
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s8
; GFX7-NEXT: s_mul_i32 s19, s0, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_mov_b32_e32 v3, s9
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s17, v2
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s18, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX7-NEXT: s_mul_i32 s17, s3, s8
; GFX7-NEXT: s_mul_i32 s18, s2, s9
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: s_mul_i32 s19, s1, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: v_mul_hi_u32 v5, v4, s8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_mul_i32 s20, s0, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s17, v5
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mov_b32_e32 v6, s10
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v8, vcc, s18, v8
; GFX7-NEXT: v_mul_hi_u32 v7, s0, v6
; GFX7-NEXT: s_mul_i32 s17, s4, s8
; GFX7-NEXT: s_mul_i32 s18, s3, s9
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GFX7-NEXT: s_mul_i32 s19, s2, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: s_mul_i32 s20, s1, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mul_hi_u32 v7, v5, s8
; GFX7-NEXT: s_mul_i32 s21, s0, s12
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s21
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v7, vcc, s17, v7
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mul_hi_u32 v4, v4, s9
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v11, vcc, s18, v11
; GFX7-NEXT: s_mul_i32 s17, s5, s8
; GFX7-NEXT: s_mul_i32 s18, s4, s9
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_mul_hi_u32 v8, s1, v6
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4
; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX7-NEXT: v_mov_b32_e32 v9, s11
; GFX7-NEXT: s_mul_i32 s19, s3, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_mul_hi_u32 v10, s0, v9
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT: s_mul_i32 s20, s2, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT: s_mul_i32 s21, s1, s12
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s21
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4
; GFX7-NEXT: v_mov_b32_e32 v7, s4
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mul_hi_u32 v8, v7, s8
; GFX7-NEXT: s_mul_i32 s22, s0, s13
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s22
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v8, vcc, s17, v8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mul_hi_u32 v10, v5, s9
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v14, vcc, s18, v14
; GFX7-NEXT: s_mul_i32 s17, s6, s8
; GFX7-NEXT: s_mul_i32 s18, s5, s9
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v6, s2, v6
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT: s_mul_i32 s19, s4, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v14, v10
; GFX7-NEXT: v_mul_hi_u32 v11, s1, v9
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_mov_b32_e32 v12, s12
; GFX7-NEXT: s_mul_i32 s20, s3, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: v_mul_hi_u32 v13, s0, v12
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v11
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT: s_mul_i32 s21, s2, s12
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v13
; GFX7-NEXT: s_add_u32 s17, s17, s21
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v6, v4
; GFX7-NEXT: s_mul_i32 s22, s1, s13
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s22
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; GFX7-NEXT: v_mov_b32_e32 v8, s5
; GFX7-NEXT: v_mul_hi_u32 v10, v8, s8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_mul_i32 s23, s0, s14
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s23
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v11, v7, s9
; GFX7-NEXT: v_add_i32_e32 v10, vcc, s17, v10
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v17, vcc, s18, v17
; GFX7-NEXT: v_mul_hi_u32 v5, v5, s10
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_mul_hi_u32 v13, s2, v9
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v17, v11
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GFX7-NEXT: v_mul_hi_u32 v14, s1, v12
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v13
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_mov_b32_e32 v15, s13
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_mul_hi_u32 v16, s0, v15
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v14
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v16
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_mov_b32_e32 v13, s14
; GFX7-NEXT: s_mul_i32 s7, s7, s8
; GFX7-NEXT: s_mul_i32 s17, s6, s9
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GFX7-NEXT: s_mul_i32 s16, s0, s8
; GFX7-NEXT: s_mul_i32 s5, s5, s10
; GFX7-NEXT: s_mul_i32 s15, s0, s15
; GFX7-NEXT: v_mul_hi_u32 v13, s0, v13
; GFX7-NEXT: s_add_i32 s0, s7, s17
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX7-NEXT: s_mul_i32 s4, s4, s11
; GFX7-NEXT: s_add_i32 s0, s0, s5
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v10, v6
; GFX7-NEXT: v_mov_b32_e32 v10, s6
; GFX7-NEXT: s_mul_i32 s11, s3, s12
; GFX7-NEXT: s_add_i32 s0, s0, s4
; GFX7-NEXT: s_mul_i32 s12, s2, s13
; GFX7-NEXT: s_add_i32 s0, s0, s11
; GFX7-NEXT: v_mul_hi_u32 v10, v10, s8
; GFX7-NEXT: s_mul_i32 s13, s1, s14
; GFX7-NEXT: s_add_i32 s0, s0, s12
; GFX7-NEXT: v_mul_hi_u32 v8, v8, s9
; GFX7-NEXT: s_add_i32 s0, s0, s13
; GFX7-NEXT: v_mul_hi_u32 v7, v7, s10
; GFX7-NEXT: v_mul_hi_u32 v9, s3, v9
; GFX7-NEXT: s_add_i32 s0, s0, s15
; GFX7-NEXT: v_mul_hi_u32 v11, s2, v12
; GFX7-NEXT: v_add_i32_e32 v10, vcc, s0, v10
; GFX7-NEXT: v_mul_hi_u32 v12, s1, v15
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v8, v7
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v11
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v12
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v13
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
; GFX7-NEXT: v_readfirstlane_b32 s3, v2
; GFX7-NEXT: v_readfirstlane_b32 s4, v3
; GFX7-NEXT: v_readfirstlane_b32 s5, v4
; GFX7-NEXT: v_readfirstlane_b32 s6, v5
; GFX7-NEXT: v_readfirstlane_b32 s7, v6
; GFX7-NEXT: s_mov_b32 s0, s16
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s17, s1, s8
; GFX8-NEXT: s_mul_i32 s18, s0, s9
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s17, v0
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s18, v1
; GFX8-NEXT: s_mul_i32 s17, s2, s8
; GFX8-NEXT: s_mul_i32 s18, s1, s9
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s8
; GFX8-NEXT: s_mul_i32 s19, s0, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s17, v2
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s18, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: s_mul_i32 s17, s3, s8
; GFX8-NEXT: s_mul_i32 s18, s2, s9
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: s_mul_i32 s19, s1, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_mul_hi_u32 v5, v4, s8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_mul_i32 s20, s0, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s17, v5
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mov_b32_e32 v6, s10
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s18, v8
; GFX8-NEXT: v_mul_hi_u32 v7, s0, v6
; GFX8-NEXT: s_mul_i32 s17, s4, s8
; GFX8-NEXT: s_mul_i32 s18, s3, s9
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v5
; GFX8-NEXT: s_mul_i32 s19, s2, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: s_mul_i32 s20, s1, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mul_hi_u32 v7, v5, s8
; GFX8-NEXT: s_mul_i32 s21, s0, s12
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s21
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s17, v7
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mul_hi_u32 v4, v4, s9
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s18, v11
; GFX8-NEXT: s_mul_i32 s17, s5, s8
; GFX8-NEXT: s_mul_i32 s18, s4, s9
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_mul_hi_u32 v8, s1, v6
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NEXT: s_mul_i32 s19, s3, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v11, v7
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_mul_hi_u32 v10, s0, v9
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: s_mul_i32 s20, s2, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: s_mul_i32 s21, s1, s12
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s21
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
; GFX8-NEXT: v_mov_b32_e32 v7, s4
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mul_hi_u32 v8, v7, s8
; GFX8-NEXT: s_mul_i32 s22, s0, s13
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s22
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s17, v8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mul_hi_u32 v10, v5, s9
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s18, v14
; GFX8-NEXT: s_mul_i32 s17, s6, s8
; GFX8-NEXT: s_mul_i32 s18, s5, s9
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v6, s2, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT: s_mul_i32 s19, s4, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10
; GFX8-NEXT: v_mul_hi_u32 v11, s1, v9
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_mov_b32_e32 v12, s12
; GFX8-NEXT: s_mul_i32 s20, s3, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: v_mul_hi_u32 v13, s0, v12
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT: s_mul_i32 s21, s2, s12
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13
; GFX8-NEXT: s_add_u32 s17, s17, s21
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
; GFX8-NEXT: s_mul_i32 s22, s1, s13
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s22
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
; GFX8-NEXT: v_mov_b32_e32 v8, s5
; GFX8-NEXT: v_mul_hi_u32 v10, v8, s8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_mul_i32 s23, s0, s14
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s23
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v11, v7, s9
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s17, v10
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s18, v17
; GFX8-NEXT: v_mul_hi_u32 v5, v5, s10
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_mul_hi_u32 v13, s2, v9
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v17, v11
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10
; GFX8-NEXT: v_mul_hi_u32 v14, s1, v12
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_mov_b32_e32 v15, s13
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_mul_hi_u32 v16, s0, v15
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v14
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_mov_b32_e32 v13, s14
; GFX8-NEXT: s_mul_i32 s7, s7, s8
; GFX8-NEXT: s_mul_i32 s17, s6, s9
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6
; GFX8-NEXT: s_mul_i32 s16, s0, s8
; GFX8-NEXT: s_mul_i32 s5, s5, s10
; GFX8-NEXT: s_mul_i32 s15, s0, s15
; GFX8-NEXT: v_mul_hi_u32 v13, s0, v13
; GFX8-NEXT: s_add_i32 s0, s7, s17
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT: s_mul_i32 s4, s4, s11
; GFX8-NEXT: s_add_i32 s0, s0, s5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v10, v6
; GFX8-NEXT: v_mov_b32_e32 v10, s6
; GFX8-NEXT: s_mul_i32 s11, s3, s12
; GFX8-NEXT: s_add_i32 s0, s0, s4
; GFX8-NEXT: s_mul_i32 s12, s2, s13
; GFX8-NEXT: s_add_i32 s0, s0, s11
; GFX8-NEXT: v_mul_hi_u32 v10, v10, s8
; GFX8-NEXT: s_mul_i32 s13, s1, s14
; GFX8-NEXT: s_add_i32 s0, s0, s12
; GFX8-NEXT: v_mul_hi_u32 v8, v8, s9
; GFX8-NEXT: s_add_i32 s0, s0, s13
; GFX8-NEXT: v_mul_hi_u32 v7, v7, s10
; GFX8-NEXT: v_mul_hi_u32 v9, s3, v9
; GFX8-NEXT: s_add_i32 s0, s0, s15
; GFX8-NEXT: v_mul_hi_u32 v11, s2, v12
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s0, v10
; GFX8-NEXT: v_mul_hi_u32 v12, s1, v15
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v12
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v13
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_readfirstlane_b32 s3, v2
; GFX8-NEXT: v_readfirstlane_b32 s4, v3
; GFX8-NEXT: v_readfirstlane_b32 s5, v4
; GFX8-NEXT: v_readfirstlane_b32 s6, v5
; GFX8-NEXT: v_readfirstlane_b32 s7, v6
; GFX8-NEXT: s_mov_b32 s0, s16
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s17, s1, s8
; GFX9-NEXT: s_mul_i32 s18, s0, s9
; GFX9-NEXT: s_add_u32 s17, s17, s18
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s19, s0, s8
; GFX9-NEXT: s_and_b32 s18, s18, 1
; GFX9-NEXT: s_add_u32 s17, s17, s19
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
; GFX9-NEXT: s_and_b32 s19, s19, 1
; GFX9-NEXT: s_add_i32 s18, s18, s19
; GFX9-NEXT: s_mul_i32 s19, s2, s8
; GFX9-NEXT: s_mul_i32 s20, s1, s9
; GFX9-NEXT: s_add_u32 s19, s19, s20
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
; GFX9-NEXT: s_mul_i32 s21, s0, s10
; GFX9-NEXT: s_and_b32 s20, s20, 1
; GFX9-NEXT: s_add_u32 s19, s19, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_mul_hi_u32 s22, s1, s8
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s19, s19, s22
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_mul_hi_u32 s23, s0, s9
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s19, s19, s23
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s18, s19, s18
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
; GFX9-NEXT: s_and_b32 s19, s19, 1
; GFX9-NEXT: s_add_i32 s20, s20, s19
; GFX9-NEXT: s_mul_i32 s19, s3, s8
; GFX9-NEXT: s_mul_i32 s21, s2, s9
; GFX9-NEXT: s_add_u32 s19, s19, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_mul_i32 s22, s1, s10
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_u32 s19, s19, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_i32 s23, s0, s11
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s23
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s24, s2, s8
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s24
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s25, s1, s9
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s25
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s26, s0, s10
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s26
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s20
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
; GFX9-NEXT: s_and_b32 s20, s20, 1
; GFX9-NEXT: s_add_i32 s21, s21, s20
; GFX9-NEXT: s_mul_i32 s20, s4, s8
; GFX9-NEXT: s_mul_i32 s22, s3, s9
; GFX9-NEXT: s_add_u32 s20, s20, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_mul_i32 s23, s2, s10
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_u32 s20, s20, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_i32 s24, s1, s11
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s24
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_i32 s25, s0, s12
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s25
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s26, s3, s8
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s26
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s27, s2, s9
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s27
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s28, s1, s10
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s28
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s29, s0, s11
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s29
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_i32 s22, s22, s21
; GFX9-NEXT: s_mul_i32 s21, s5, s8
; GFX9-NEXT: s_mul_i32 s23, s4, s9
; GFX9-NEXT: s_add_u32 s21, s21, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_mul_i32 s24, s3, s10
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_u32 s21, s21, s24
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s25, s2, s11
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s25
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s26, s1, s12
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s26
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s27, s0, s13
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s27
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s28, s4, s8
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s28
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s29
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s30, s2, s10
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s30
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s31, s1, s11
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s31
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s33, s0, s12
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s33
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_i32 s23, s23, s22
; GFX9-NEXT: s_mul_i32 s22, s6, s8
; GFX9-NEXT: s_mul_i32 s24, s5, s9
; GFX9-NEXT: s_add_u32 s22, s22, s24
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_mul_i32 s25, s4, s10
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_add_u32 s22, s22, s25
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s26, s3, s11
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s26
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s27, s2, s12
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s27
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s28, s1, s13
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s28
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s29, s0, s14
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s29
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s30, s5, s8
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s30
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s31, s4, s9
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s31
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s33, s3, s10
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s33
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s34, s2, s11
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s34
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s35, s1, s12
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s35
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s36, s0, s13
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s36
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_i32 s24, s24, s23
; GFX9-NEXT: s_mul_i32 s23, s6, s9
; GFX9-NEXT: s_mul_i32 s7, s7, s8
; GFX9-NEXT: s_mul_i32 s25, s5, s10
; GFX9-NEXT: s_add_i32 s7, s7, s23
; GFX9-NEXT: s_mul_i32 s26, s4, s11
; GFX9-NEXT: s_add_i32 s7, s7, s25
; GFX9-NEXT: s_mul_i32 s27, s3, s12
; GFX9-NEXT: s_add_i32 s7, s7, s26
; GFX9-NEXT: s_mul_i32 s28, s2, s13
; GFX9-NEXT: s_add_i32 s7, s7, s27
; GFX9-NEXT: s_mul_i32 s29, s1, s14
; GFX9-NEXT: s_add_i32 s7, s7, s28
; GFX9-NEXT: s_mul_i32 s15, s0, s15
; GFX9-NEXT: s_add_i32 s7, s7, s29
; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8
; GFX9-NEXT: s_add_i32 s7, s7, s15
; GFX9-NEXT: s_mul_hi_u32 s5, s5, s9
; GFX9-NEXT: s_add_i32 s6, s7, s6
; GFX9-NEXT: s_add_i32 s5, s6, s5
; GFX9-NEXT: s_mul_hi_u32 s4, s4, s10
; GFX9-NEXT: s_add_i32 s4, s5, s4
; GFX9-NEXT: s_mul_hi_u32 s3, s3, s11
; GFX9-NEXT: s_add_i32 s3, s4, s3
; GFX9-NEXT: s_mul_hi_u32 s2, s2, s12
; GFX9-NEXT: s_add_i32 s2, s3, s2
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s13
; GFX9-NEXT: s_mul_i32 s16, s0, s8
; GFX9-NEXT: s_add_i32 s1, s2, s1
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s14
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_add_i32 s7, s0, s24
; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: s_mov_b32 s1, s17
; GFX9-NEXT: s_mov_b32 s2, s18
; GFX9-NEXT: s_mov_b32 s3, s19
; GFX9-NEXT: s_mov_b32 s4, s20
; GFX9-NEXT: s_mov_b32 s5, s21
; GFX9-NEXT: s_mov_b32 s6, s22
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i256:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s16, s1, s8
; GFX10-NEXT: s_mul_i32 s17, s0, s9
; GFX10-NEXT: s_mul_hi_u32 s18, s0, s8
; GFX10-NEXT: s_add_u32 s16, s16, s17
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_mul_i32 s19, s1, s9
; GFX10-NEXT: s_and_b32 s17, s17, 1
; GFX10-NEXT: s_add_u32 s16, s16, s18
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
; GFX10-NEXT: s_mul_i32 s20, s0, s10
; GFX10-NEXT: s_and_b32 s18, s18, 1
; GFX10-NEXT: s_mul_hi_u32 s21, s1, s8
; GFX10-NEXT: s_add_i32 s17, s17, s18
; GFX10-NEXT: s_mul_i32 s18, s2, s8
; GFX10-NEXT: s_mul_i32 s22, s0, s11
; GFX10-NEXT: s_add_u32 s18, s18, s19
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
; GFX10-NEXT: s_mul_i32 s23, s1, s11
; GFX10-NEXT: s_and_b32 s19, s19, 1
; GFX10-NEXT: s_add_u32 s18, s18, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s24, s0, s12
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_mul_i32 s25, s4, s9
; GFX10-NEXT: s_add_i32 s19, s19, s20
; GFX10-NEXT: s_add_u32 s18, s18, s21
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s21, s0, s9
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_mul_i32 s26, s2, s11
; GFX10-NEXT: s_add_i32 s19, s19, s20
; GFX10-NEXT: s_add_u32 s18, s18, s21
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s21, s1, s10
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_mul_i32 s27, s0, s13
; GFX10-NEXT: s_add_i32 s19, s19, s20
; GFX10-NEXT: s_add_u32 s17, s18, s17
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
; GFX10-NEXT: s_mul_i32 s20, s2, s9
; GFX10-NEXT: s_and_b32 s18, s18, 1
; GFX10-NEXT: s_mul_hi_u32 s28, s3, s9
; GFX10-NEXT: s_add_i32 s19, s19, s18
; GFX10-NEXT: s_mul_i32 s18, s3, s8
; GFX10-NEXT: s_mul_i32 s7, s7, s8
; GFX10-NEXT: s_add_u32 s18, s18, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s15, s0, s15
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_add_u32 s18, s18, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s22, s2, s8
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s22, s1, s9
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s22, s0, s10
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_i32 s22, s2, s10
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s19
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
; GFX10-NEXT: s_mul_i32 s21, s3, s9
; GFX10-NEXT: s_and_b32 s19, s19, 1
; GFX10-NEXT: s_add_i32 s20, s20, s19
; GFX10-NEXT: s_mul_i32 s19, s4, s8
; GFX10-NEXT: s_add_u32 s19, s19, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_u32 s19, s19, s22
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s23
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s23, s3, s8
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s24
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s24, s2, s9
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s23
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s23, s1, s10
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s24
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s24, s0, s11
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s23
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_i32 s23, s5, s8
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s24
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_i32 s24, s3, s10
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s22, s1, s12
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_add_i32 s21, s21, s20
; GFX10-NEXT: s_add_u32 s23, s23, s25
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_u32 s23, s23, s24
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s24, s25, s24
; GFX10-NEXT: s_add_u32 s23, s23, s26
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s26, s2, s10
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_i32 s24, s24, s25
; GFX10-NEXT: s_add_u32 s22, s23, s22
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s25, s1, s11
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s23, s24, s23
; GFX10-NEXT: s_add_u32 s22, s22, s27
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s27, s0, s12
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s20, s22, s20
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_i32 s24, s6, s8
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s22, s23, s22
; GFX10-NEXT: s_add_u32 s20, s20, s28
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s28, s5, s9
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s26
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s26, s4, s10
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s25
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s25, s3, s11
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s27
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s27, s2, s12
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_i32 s23, s1, s13
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s22, s22, s21
; GFX10-NEXT: s_add_u32 s21, s24, s28
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s28, s0, s14
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_u32 s21, s21, s26
; GFX10-NEXT: s_cselect_b32 s26, 1, 0
; GFX10-NEXT: s_and_b32 s26, s26, 1
; GFX10-NEXT: s_add_i32 s24, s24, s26
; GFX10-NEXT: s_add_u32 s21, s21, s25
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s26, s5, s8
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_i32 s24, s24, s25
; GFX10-NEXT: s_add_u32 s21, s21, s27
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s27, s4, s9
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_i32 s24, s24, s25
; GFX10-NEXT: s_add_u32 s21, s21, s23
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s25, s3, s10
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s23, s24, s23
; GFX10-NEXT: s_add_u32 s21, s21, s28
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s28, s2, s11
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s26
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s26, s1, s12
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s27
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s27, s0, s13
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s25
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s25, s6, s9
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_mul_hi_u32 s6, s6, s8
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s28
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s26
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s26, s5, s10
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_mul_hi_u32 s5, s5, s9
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s27
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s27, s4, s11
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_mul_hi_u32 s4, s4, s10
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s22
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_add_i32 s7, s7, s25
; GFX10-NEXT: s_mul_i32 s24, s3, s12
; GFX10-NEXT: s_add_i32 s7, s7, s26
; GFX10-NEXT: s_mul_i32 s25, s2, s13
; GFX10-NEXT: s_add_i32 s7, s7, s27
; GFX10-NEXT: s_mul_i32 s26, s1, s14
; GFX10-NEXT: s_add_i32 s7, s7, s24
; GFX10-NEXT: s_mul_hi_u32 s3, s3, s11
; GFX10-NEXT: s_add_i32 s7, s7, s25
; GFX10-NEXT: s_mul_hi_u32 s2, s2, s12
; GFX10-NEXT: s_add_i32 s7, s7, s26
; GFX10-NEXT: s_mul_hi_u32 s1, s1, s13
; GFX10-NEXT: s_add_i32 s7, s7, s15
; GFX10-NEXT: s_add_i32 s6, s7, s6
; GFX10-NEXT: s_add_i32 s5, s6, s5
; GFX10-NEXT: s_mov_b32 s6, s21
; GFX10-NEXT: s_add_i32 s4, s5, s4
; GFX10-NEXT: s_mov_b32 s5, s20
; GFX10-NEXT: s_add_i32 s3, s4, s3
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s14
; GFX10-NEXT: s_add_i32 s2, s3, s2
; GFX10-NEXT: s_and_b32 s3, s22, 1
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: s_add_i32 s23, s23, s3
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: s_mul_i32 s0, s0, s8
; GFX10-NEXT: s_add_i32 s7, s1, s23
; GFX10-NEXT: s_mov_b32 s1, s16
; GFX10-NEXT: s_mov_b32 s2, s17
; GFX10-NEXT: s_mov_b32 s3, s18
; GFX10-NEXT: s_mov_b32 s4, s19
; GFX10-NEXT: ; return to shader part epilog
%result = mul i256 %num, %den
%cast = bitcast i256 %result to <8 x i32>
ret <8 x i32> %cast
}
define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7-LABEL: v_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX7-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX7-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX7-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX7-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX7-NEXT: v_add_i32_e32 v16, vcc, v16, v17
; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v16, vcc, v16, v18
; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v17, vcc, v17, v18
; GFX7-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v18, v21
; GFX7-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v18, v21
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_add_i32_e32 v17, vcc, v18, v17
; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v20, v3, v8
; GFX7-NEXT: v_mul_lo_u32 v21, v2, v9
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX7-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v2, v8
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v1, v9
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v0, v10
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v21, v4, v8
; GFX7-NEXT: v_mul_lo_u32 v22, v3, v9
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v0, v12
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v2, v9
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v0, v11
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX7-NEXT: v_mul_lo_u32 v23, v4, v9
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_mul_lo_u32 v21, v3, v10
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v23, v22
; GFX7-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX7-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v24, v23
; GFX7-NEXT: v_mul_lo_u32 v24, v3, v11
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v22
; GFX7-NEXT: v_mul_lo_u32 v22, v0, v8
; GFX7-NEXT: v_mul_hi_u32 v8, v6, v8
; GFX7-NEXT: v_mul_lo_u32 v6, v6, v9
; GFX7-NEXT: v_mul_hi_u32 v9, v5, v9
; GFX7-NEXT: v_mul_lo_u32 v5, v5, v10
; GFX7-NEXT: v_mul_hi_u32 v10, v4, v10
; GFX7-NEXT: v_mul_lo_u32 v4, v4, v11
; GFX7-NEXT: v_mul_hi_u32 v11, v3, v11
; GFX7-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX7-NEXT: v_mul_hi_u32 v12, v2, v12
; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; GFX7-NEXT: v_mul_hi_u32 v13, v1, v13
; GFX7-NEXT: v_mul_lo_u32 v1, v1, v14
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v15
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v8
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v9
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v10
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v12
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v13
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v0, v23
; GFX7-NEXT: v_mov_b32_e32 v0, v22
; GFX7-NEXT: v_mov_b32_e32 v1, v16
; GFX7-NEXT: v_mov_b32_e32 v2, v17
; GFX7-NEXT: v_mov_b32_e32 v3, v18
; GFX7-NEXT: v_mov_b32_e32 v4, v19
; GFX7-NEXT: v_mov_b32_e32 v5, v20
; GFX7-NEXT: v_mov_b32_e32 v6, v21
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX8-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX8-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX8-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX8-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v17
; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v18
; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v18
; GFX8-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v21
; GFX8-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v21
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v18, v17
; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v20, v3, v8
; GFX8-NEXT: v_mul_lo_u32 v21, v2, v9
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX8-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v2, v8
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v1, v9
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v0, v10
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v21, v4, v8
; GFX8-NEXT: v_mul_lo_u32 v22, v3, v9
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v0, v12
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v2, v9
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v0, v11
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX8-NEXT: v_mul_lo_u32 v23, v4, v9
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_mul_lo_u32 v21, v3, v10
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v23, v22
; GFX8-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX8-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v24, v23
; GFX8-NEXT: v_mul_lo_u32 v24, v3, v11
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v22
; GFX8-NEXT: v_mul_lo_u32 v22, v0, v8
; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8
; GFX8-NEXT: v_mul_lo_u32 v6, v6, v9
; GFX8-NEXT: v_mul_hi_u32 v9, v5, v9
; GFX8-NEXT: v_mul_lo_u32 v5, v5, v10
; GFX8-NEXT: v_mul_hi_u32 v10, v4, v10
; GFX8-NEXT: v_mul_lo_u32 v4, v4, v11
; GFX8-NEXT: v_mul_hi_u32 v11, v3, v11
; GFX8-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX8-NEXT: v_mul_hi_u32 v12, v2, v12
; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
; GFX8-NEXT: v_mul_hi_u32 v13, v1, v13
; GFX8-NEXT: v_mul_lo_u32 v1, v1, v14
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v15
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v8
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v9
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v10
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v11
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v12
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v13
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v23
; GFX8-NEXT: v_mov_b32_e32 v0, v22
; GFX8-NEXT: v_mov_b32_e32 v1, v16
; GFX8-NEXT: v_mov_b32_e32 v2, v17
; GFX8-NEXT: v_mov_b32_e32 v3, v18
; GFX8-NEXT: v_mov_b32_e32 v4, v19
; GFX8-NEXT: v_mov_b32_e32 v5, v20
; GFX8-NEXT: v_mov_b32_e32 v6, v21
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v16, v2, v8
; GFX9-NEXT: v_mul_lo_u32 v17, v1, v9
; GFX9-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX9-NEXT: v_mul_hi_u32 v19, v1, v8
; GFX9-NEXT: v_mul_lo_u32 v20, v1, v8
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v17
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v18
; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v16, v19
; GFX9-NEXT: v_mul_lo_u32 v21, v0, v9
; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v18, v17, v18, v16
; GFX9-NEXT: v_mul_hi_u32 v16, v0, v8
; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, v20, v21
; GFX9-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v17, v16
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21
; GFX9-NEXT: v_add_u32_e32 v17, v20, v17
; GFX9-NEXT: v_mul_lo_u32 v21, v3, v8
; GFX9-NEXT: v_mul_lo_u32 v22, v2, v9
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, v19, v17
; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v20, v19
; GFX9-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v22
; GFX9-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v22
; GFX9-NEXT: v_mul_hi_u32 v23, v2, v8
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v21, v20, v22
; GFX9-NEXT: v_mul_hi_u32 v21, v1, v9
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v0, v10
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v22, v21
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23
; GFX9-NEXT: v_mul_lo_u32 v22, v4, v8
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v9
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, v19, v18
; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v19, v20, v21, v19
; GFX9-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v22, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v22, v0, v12
; GFX9-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX9-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v22, v23
; GFX9-NEXT: v_mul_hi_u32 v22, v2, v9
; GFX9-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v22, v23
; GFX9-NEXT: v_mul_hi_u32 v22, v0, v11
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v21, v22, v20
; GFX9-NEXT: v_mul_lo_u32 v21, v5, v8
; GFX9-NEXT: v_mul_lo_u32 v22, v4, v9
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v22, v23, v21
; GFX9-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX9-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v11
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v24, v25, v23
; GFX9-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v22, v21
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v23, v24, v22
; GFX9-NEXT: v_mul_lo_u32 v23, v6, v9
; GFX9-NEXT: v_mul_lo_u32 v24, v4, v11
; GFX9-NEXT: v_mul_hi_u32 v4, v4, v10
; GFX9-NEXT: v_mul_hi_u32 v6, v6, v8
; GFX9-NEXT: v_add_u32_e32 v7, v7, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10
; GFX9-NEXT: v_mul_hi_u32 v5, v5, v9
; GFX9-NEXT: v_mul_hi_u32 v9, v3, v11
; GFX9-NEXT: v_mul_hi_u32 v10, v2, v12
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX9-NEXT: v_mul_hi_u32 v11, v1, v13
; GFX9-NEXT: v_mul_lo_u32 v12, v1, v14
; GFX9-NEXT: v_mul_lo_u32 v13, v0, v15
; GFX9-NEXT: v_add3_u32 v7, v7, v23, v24
; GFX9-NEXT: v_add3_u32 v2, v7, v3, v2
; GFX9-NEXT: v_mul_lo_u32 v1, v0, v8
; GFX9-NEXT: v_add3_u32 v2, v2, v12, v13
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX9-NEXT: v_add3_u32 v2, v2, v6, v5
; GFX9-NEXT: v_add3_u32 v2, v2, v4, v9
; GFX9-NEXT: v_add3_u32 v2, v2, v10, v11
; GFX9-NEXT: v_add3_u32 v7, v2, v0, v22
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v16
; GFX9-NEXT: v_mov_b32_e32 v2, v17
; GFX9-NEXT: v_mov_b32_e32 v3, v18
; GFX9-NEXT: v_mov_b32_e32 v4, v19
; GFX9-NEXT: v_mov_b32_e32 v5, v20
; GFX9-NEXT: v_mov_b32_e32 v6, v21
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX10-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX10-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX10-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX10-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX10-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX10-NEXT: v_mul_lo_u32 v22, v3, v8
; GFX10-NEXT: v_mul_lo_u32 v25, v1, v10
; GFX10-NEXT: v_mul_hi_u32 v23, v0, v9
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v17
; GFX10-NEXT: v_mul_hi_u32 v27, v0, v10
; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v18
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v19, s4, v19, v20
; GFX10-NEXT: v_mul_lo_u32 v20, v2, v9
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v17, v17, v18
; GFX10-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX10-NEXT: v_add_co_u32 v18, s4, v19, v18
; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v22, v20
; GFX10-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX10-NEXT: v_add_co_u32 v18, s5, v18, v21
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v25
; GFX10-NEXT: v_add_co_u32 v18, s5, v18, v23
; GFX10-NEXT: v_mul_hi_u32 v23, v1, v9
; GFX10-NEXT: v_add3_u32 v19, v24, v19, v21
; GFX10-NEXT: v_mul_hi_u32 v21, v2, v8
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v22
; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v17, s5, v18, v17
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v21
; GFX10-NEXT: v_add3_u32 v21, v26, v24, v25
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v8
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v9
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v23
; GFX10-NEXT: v_add3_u32 v18, v19, v29, v18
; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v29, v3, v9
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
; GFX10-NEXT: v_add3_u32 v30, v21, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v21, v2, v10
; GFX10-NEXT: v_add_co_u32 v22, s4, v25, v26
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v11
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v26, v3, v8
; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v21, s4, v22, v21
; GFX10-NEXT: v_mul_lo_u32 v22, v0, v12
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v24
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v18, s4, v20, v18
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v9
; GFX10-NEXT: v_add3_u32 v24, v25, v27, v24
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v19, v30, v23, v20
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v26
; GFX10-NEXT: v_mul_hi_u32 v20, v1, v10
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v27, v0, v11
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX10-NEXT: v_add3_u32 v23, v24, v25, v26
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v9
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v21, v20
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v24
; GFX10-NEXT: v_add3_u32 v35, v23, v30, v21
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX10-NEXT: v_cndmask_b32_e64 v34, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v22, s4, v31, v26
; GFX10-NEXT: v_mul_lo_u32 v26, v1, v12
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v19, s5, v20, v19
; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v23
; GFX10-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v22, s4, v31, v26
; GFX10-NEXT: v_mul_hi_u32 v26, v4, v8
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v20, v35, v25, v20
; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v23
; GFX10-NEXT: v_add3_u32 v23, v34, v27, v28
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v10
; GFX10-NEXT: v_mul_lo_u32 v28, v5, v9
; GFX10-NEXT: v_add_co_u32 v27, s4, v31, v26
; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11
; GFX10-NEXT: v_add3_u32 v23, v23, v30, v24
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v27, v29
; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v29, v0, v12
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_add3_u32 v23, v23, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v10
; GFX10-NEXT: v_cndmask_b32_e64 v33, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v25, s4, v27, v28
; GFX10-NEXT: v_add_co_u32 v31, s5, v21, v26
; GFX10-NEXT: v_mul_lo_u32 v27, v3, v11
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v25, v24
; GFX10-NEXT: v_add_co_u32 v21, s5, v31, v29
; GFX10-NEXT: v_add3_u32 v39, v23, v33, v26
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v12
; GFX10-NEXT: v_cndmask_b32_e64 v35, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v24, v27
; GFX10-NEXT: v_mul_lo_u32 v27, v1, v13
; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9
; GFX10-NEXT: v_mul_hi_u32 v25, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v34, s4, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v35, v28, v35, v29
; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20
; GFX10-NEXT: v_add_co_u32 v23, s4, v34, v27
; GFX10-NEXT: v_mul_hi_u32 v27, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12
; GFX10-NEXT: v_add_co_u32 v34, s4, v23, v24
; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v22, v35, v30, v32
; GFX10-NEXT: v_add3_u32 v21, v39, v26, v21
; GFX10-NEXT: v_add_co_u32 v34, s4, v34, v27
; GFX10-NEXT: v_mul_hi_u32 v26, v2, v11
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v34, v31
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v22, v22, v28, v27
; GFX10-NEXT: v_mul_lo_u32 v28, v6, v9
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v25
; GFX10-NEXT: v_mul_hi_u32 v27, v1, v12
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v6, v6, v8
; GFX10-NEXT: v_add_co_u32 v30, s4, v23, v26
; GFX10-NEXT: v_add3_u32 v33, v22, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v11
; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v28
; GFX10-NEXT: v_mul_lo_u32 v28, v2, v13
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v30, v27
; GFX10-NEXT: v_mul_hi_u32 v5, v5, v9
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v4, v4, v10
; GFX10-NEXT: v_add3_u32 v7, v7, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v14
; GFX10-NEXT: v_mul_hi_u32 v25, v0, v13
; GFX10-NEXT: v_add3_u32 v33, v33, v26, v27
; GFX10-NEXT: v_mul_hi_u32 v2, v2, v12
; GFX10-NEXT: v_add3_u32 v26, v7, v29, v28
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v13
; GFX10-NEXT: v_add3_u32 v7, v26, v24, v15
; GFX10-NEXT: v_add_co_u32 v11, s4, v23, v25
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v5, v7, v6, v5
; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v21
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v3, v5, v4, v3
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v14
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v5, v33, v10, v7
; GFX10-NEXT: v_add3_u32 v3, v3, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v16
; GFX10-NEXT: v_mov_b32_e32 v2, v17
; GFX10-NEXT: v_add3_u32 v7, v3, v4, v5
; GFX10-NEXT: v_mov_b32_e32 v3, v18
; GFX10-NEXT: v_mov_b32_e32 v4, v19
; GFX10-NEXT: v_mov_b32_e32 v5, v20
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
}