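[AMDGPU][GISel] Smaller code for scalar 32-bit to 64-bit extensions
Select scalar G_SEXT, G_ZEXT and G_SEXT_INREG from 32 to 64 bits as a REG_SEQUENCE of the low half and a separately computed high half (S_ASHR_I32 by 31 for sign extension, S_MOV_B32 0 for zero extension) instead of S_BFE_I64/S_BFE_U64 with a 0x200000 operand.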
Differential Revision: https://reviews.llvm.org/D107639
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir
index 24faa2c..0dcbab0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir
@@ -225,10 +225,9 @@
; GCN: liveins: $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[DEF]], %subreg.sub1
- ; GCN-NEXT: [[S_BFE_I64_:%[0-9]+]]:sreg_64 = S_BFE_I64 [[REG_SEQUENCE]], 2097152, implicit-def $scc
- ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_BFE_I64_]]
+ ; GCN-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[COPY]].sub0, 31, implicit-def $scc
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1
+ ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]]
%0:sgpr(s64) = COPY $sgpr0_sgpr1
%1:sgpr(s64) = G_SEXT_INREG %0, 32
$sgpr0_sgpr1 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir
index 1056cc4..ec95662 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir
@@ -127,10 +127,9 @@
; GCN: liveins: $sgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1
- ; GCN-NEXT: [[S_BFE_I64_:%[0-9]+]]:sreg_64 = S_BFE_I64 [[REG_SEQUENCE]], 2097152, implicit-def $scc
- ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_BFE_I64_]]
+ ; GCN-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[COPY]], 31, implicit-def $scc
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1
+ ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s64) = G_SEXT %0
$sgpr0_sgpr1 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir
index 86ac8f5..8f18f58 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir
@@ -127,10 +127,9 @@
; GCN: liveins: $sgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1
- ; GCN-NEXT: [[S_BFE_U64_:%[0-9]+]]:sreg_64 = S_BFE_U64 [[REG_SEQUENCE]], 2097152, implicit-def $scc
- ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_BFE_U64_]]
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s64) = G_ZEXT %0
$sgpr0_sgpr1 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 8f5e9b7..7295442 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -252,11 +252,11 @@
define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(i32 addrspace(1)* inreg %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_store_sgpr_ptr_sgpr_offset:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_ashr_i32 s5, s4, 31
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
-; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -266,11 +266,11 @@
;
; GFX7-LABEL: mubuf_store_sgpr_ptr_sgpr_offset:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
-; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -285,8 +285,8 @@
define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -295,8 +295,8 @@
;
; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -310,8 +310,8 @@
define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset_offset256:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -320,8 +320,8 @@
;
; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset_offset256:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -336,8 +336,8 @@
define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -346,8 +346,8 @@
;
; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -698,11 +698,11 @@
define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inreg %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_load_sgpr_ptr_sgpr_offset:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_ashr_i32 s5, s4, 31
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
-; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_mov_b32_e32 v1, s5
@@ -712,11 +712,11 @@
;
; GFX7-LABEL: mubuf_load_sgpr_ptr_sgpr_offset:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
-; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: v_mov_b32_e32 v1, s5
@@ -731,8 +731,8 @@
define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -741,8 +741,8 @@
;
; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -756,8 +756,8 @@
define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
@@ -766,8 +766,8 @@
;
; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
@@ -782,8 +782,8 @@
define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspace(1)* %ptr, i32 inreg %soffset) {
; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT: s_ashr_i32 s3, s2, 31
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
@@ -792,8 +792,8 @@
;
; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT: s_ashr_i32 s3, s2, 31
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 0bd33b9..8f300f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -120,14 +120,14 @@
; GCN-LABEL: s_shl_i64_zext_i32_overflow:
; GCN: ; %bb.0:
; GCN-NEXT: s_bitset0_b32 s0, 31
-; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
+; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_shl_i64_zext_i32_overflow:
; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_mov_b32 s1, 0
; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31
-; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX10PLUS-NEXT: ; return to shader part epilog
%and = and i32 %x, 2147483647
@@ -187,14 +187,14 @@
; GCN-LABEL: s_shl_i64_sext_i32_overflow:
; GCN: ; %bb.0:
; GCN-NEXT: s_bitset0_b32 s0, 31
-; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000
+; GCN-NEXT: s_ashr_i32 s1, s0, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_shl_i64_sext_i32_overflow:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31
-; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000
+; GFX10PLUS-NEXT: s_ashr_i32 s1, s0, 31
; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX10PLUS-NEXT: ; return to shader part epilog
%and = and i32 %x, 2147483647
@@ -434,9 +434,10 @@
; GCN-NEXT: s_brev_b32 s2, -4
; GCN-NEXT: s_mov_b32 s3, s2
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: s_bfe_u64 s[2:3], s[0:1], 0x200000
-; GCN-NEXT: s_mov_b32 s0, s1
-; GCN-NEXT: s_bfe_u64 s[4:5], s[0:1], 0x200000
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_mov_b32 s4, s1
+; GCN-NEXT: s_mov_b32 s5, s3
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2
; GCN-NEXT: ; return to shader part epilog
@@ -446,11 +447,12 @@
; GFX10PLUS-NEXT: s_brev_b32 s2, -4
; GFX10PLUS-NEXT: s_mov_b32 s3, s2
; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10PLUS-NEXT: s_mov_b32 s2, s1
-; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
-; GFX10PLUS-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x200000
-; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_mov_b32 s2, s0
+; GFX10PLUS-NEXT: s_mov_b32 s4, s1
+; GFX10PLUS-NEXT: s_mov_b32 s5, s3
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[4:5], 2
; GFX10PLUS-NEXT: ; return to shader part epilog
%and = and <2 x i32> %x, <i32 1073741823, i32 1073741823>
%ext = zext <2 x i32> %and to <2 x i64>
@@ -525,9 +527,10 @@
; GCN-NEXT: s_brev_b32 s2, -8
; GCN-NEXT: s_mov_b32 s3, s2
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x200000
-; GCN-NEXT: s_mov_b32 s0, s1
-; GCN-NEXT: s_bfe_i64 s[4:5], s[0:1], 0x200000
+; GCN-NEXT: s_ashr_i32 s3, s0, 31
+; GCN-NEXT: s_mov_b32 s2, s0
+; GCN-NEXT: s_ashr_i32 s5, s1, 31
+; GCN-NEXT: s_mov_b32 s4, s1
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2
; GCN-NEXT: ; return to shader part epilog
@@ -537,11 +540,12 @@
; GFX10PLUS-NEXT: s_brev_b32 s2, -8
; GFX10PLUS-NEXT: s_mov_b32 s3, s2
; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX10PLUS-NEXT: s_mov_b32 s2, s1
-; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000
-; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x200000
-; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
-; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GFX10PLUS-NEXT: s_ashr_i32 s3, s0, 31
+; GFX10PLUS-NEXT: s_mov_b32 s2, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s5, s1, 31
+; GFX10PLUS-NEXT: s_mov_b32 s4, s1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[4:5], 2
; GFX10PLUS-NEXT: ; return to shader part epilog
%and = and <2 x i32> %x, <i32 536870911, i32 536870911>
%ext = sext <2 x i32> %and to <2 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index acf24a6..ddc6734 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -664,11 +664,11 @@
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3]
; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64
-; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 9c4d7ae..0d02bf8 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -448,10 +448,10 @@
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_mov_b32 s1, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3]
-; GFX9-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index dd26102..fd8c739 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -572,11 +572,11 @@
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3]
; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64
-; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 198c180..ba3ed97 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -549,12 +549,12 @@
; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_mov_b32 s5, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3]
-; GFX9-GISEL-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x200000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone