blob: c71b892ab6a6ebff4cb1c1f5c6ef4a99628b6a56 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -O0 < %s | FileCheck %s
; Regression tests for scalar fshl/fshr with constant shift amounts that
; caused miscompilation via the scalar S_LSHR_B64 fast path:
;
; - fshl(scalar, const, 0): the FSHL->FSHR conversion computed ~0 = -1 as the
; inverted shift, which (being < 32 in a signed comparison) bypassed the
; S_AND_B32 masking and caused the final S_LSHR_B64 to shift by 63 instead
; of by the masked amount, 31.
;
; - fshl(scalar, const, 1): the FSHL->FSHR conversion computes -Z = -1 as the
; inverted shift; the same buggy immediate pattern matched it and skipped the
; masking.
;
; - fshl(scalar, const, 16): same path computes -16 as the inverted shift;
; -16 also satisfies Imm < 32 signed and would bypass S_AND_B32 masking.
;
; - fshl(scalar, scalar, 8): same lowering applies when both inputs are uniform
; scalars; -8 would bypass S_AND_B32 masking.
;
; Correct behaviour:
; fshl(X, Y, 0) == X
; fshl(X, Y, 1) == (X << 1) | (Y >> 31)
; fshl(X, Y, 16) == (X << 16) | (Y >> 16)
; fshl(X, Y, 8) == (X << 8) | (Y >> 24)
; fshl(scalar, const, 0) must return scalar unchanged.
;
; Expected code, as pinned by the assertions in the body: the {const, scalar}
; register pair is first shifted right by one (s_lshr_b32 on the scalar plus
; the first s_lshr_b64), then the recombined pair is shifted right by the
; inverted amount. The inverted amount is materialized as -1 and must pass
; through s_and_b32 with 31 before the final s_lshr_b64, so the shift applied
; is 31 rather than the raw -1.
define void @fshl_scalar_const_shift0(ptr addrspace(1) %out, i32 %x) {
; CHECK-LABEL: fshl_scalar_const_shift0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v3, v1
; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v1, v3
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-NEXT: s_mov_b32 s3, 1
; CHECK-NEXT: s_lshr_b32 s2, s4, s3
; CHECK-NEXT: s_mov_b32 s0, 0x5040305
; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; CHECK-NEXT: s_mov_b32 s1, s4
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; CHECK-NEXT: s_mov_b32 s1, s2
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_and_b32 s2, s2, 31
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; readfirstlane yields a uniform value, so the funnel shift below is lowered
; with scalar (s_*) instructions, as the expected output above shows.
%scalar = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
; 84148997 == 0x05040305, the constant moved into s0 in the expected output.
; A shift amount of 0 means the result must equal %scalar.
%result = call i32 @llvm.fshl.i32(i32 %scalar, i32 84148997, i32 0)
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; fshl(scalar, const, 1): inverted shift is 31 (= -1 masked to 31), which is
; correct. The bug was that -1 matched ShiftAmt32Imm (Imm < 32, signed) and
; skipped the S_AND_B32 masking, passing -1 raw to S_LSHR_B64.
;
; Expected code, as pinned by the assertions in the body: one s_lshr_b64 of
; the {const, scalar} register pair by (-1 & 31) = 31. The low half of that
; result is (scalar << 1) | (const >> 31).
define void @fshl_scalar_const_shift1(ptr addrspace(1) %out, i32 %x) {
; CHECK-LABEL: fshl_scalar_const_shift1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v3, v1
; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v1, v3
; CHECK-NEXT: v_readfirstlane_b32 s2, v2
; CHECK-NEXT: s_mov_b32 s0, 0xf2f2f2f2
; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; CHECK-NEXT: s_mov_b32 s1, s2
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_and_b32 s2, s2, 31
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; readfirstlane yields a uniform value, so the funnel shift below is lowered
; with scalar (s_*) instructions, as the expected output above shows.
%scalar = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
; -218959118 == 0xF2F2F2F2, the constant moved into s0 in the expected output.
%result = call i32 @llvm.fshl.i32(i32 %scalar, i32 -218959118, i32 1)
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; fshl(scalar, const, 16): inverted shift is -16, which also satisfied
; Imm < 32 signed and bypassed S_AND_B32 masking before the fix.
;
; Expected code, as pinned by the assertions in the body: one s_lshr_b64 of
; the {const, scalar} register pair by (-16 & 31) = 16. The low half of that
; result is (scalar << 16) | (const >> 16).
define void @fshl_scalar_const_shift16(ptr addrspace(1) %out, i32 %x) {
; CHECK-LABEL: fshl_scalar_const_shift16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v3, v1
; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v1, v3
; CHECK-NEXT: v_readfirstlane_b32 s2, v2
; CHECK-NEXT: s_mov_b32 s0, 0xf2f2f2f2
; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; CHECK-NEXT: s_mov_b32 s1, s2
; CHECK-NEXT: s_mov_b32 s2, -16
; CHECK-NEXT: s_and_b32 s2, s2, 31
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; readfirstlane yields a uniform value, so the funnel shift below is lowered
; with scalar (s_*) instructions, as the expected output above shows.
%scalar = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
; -218959118 == 0xF2F2F2F2, the constant moved into s0 in the expected output.
%result = call i32 @llvm.fshl.i32(i32 %scalar, i32 -218959118, i32 16)
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; fshl(scalar, scalar, 8): both inputs uniform; inverted shift is -8, which
; also satisfied Imm < 32 signed and bypassed S_AND_B32 masking before the fix.
;
; Expected code, as pinned by the assertions in the body: both operands are
; read into SGPRs with v_readfirstlane_b32, combined into one 64-bit register
; pair, and shifted right once by (-8 & 31) = 24.
define void @fshl_scalar_scalar_shift8(ptr addrspace(1) %out, i32 %x, i32 %y) {
; CHECK-LABEL: fshl_scalar_scalar_shift8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v4, v1
; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; CHECK-NEXT: v_mov_b32_e32 v1, v4
; CHECK-NEXT: v_readfirstlane_b32 s2, v2
; CHECK-NEXT: v_readfirstlane_b32 s0, v3
; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; CHECK-NEXT: s_mov_b32 s1, s2
; CHECK-NEXT: s_mov_b32 s2, -8
; CHECK-NEXT: s_and_b32 s2, s2, 31
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; Both funnel-shift operands come from readfirstlane, so neither is a
; constant; only the shift amount (8) is an immediate.
%sx = call i32 @llvm.amdgcn.readfirstlane(i32 %x)
%sy = call i32 @llvm.amdgcn.readfirstlane(i32 %y)
; Required result: (%sx << 8) | (%sy >> 24).
%result = call i32 @llvm.fshl.i32(i32 %sx, i32 %sy, i32 8)
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}