| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -O0 < %s | FileCheck %s |
| |
| ; Regression tests for scalar fshl/fshr with constant shift amounts that |
| ; caused miscompilation via the scalar S_LSHR_B64 fast path: |
| ; |
| ; - fshl(scalar, const, 0): the FSHL->FSHR conversion computed ~0 = -1 as the |
| ; inverted shift, which (being < 32 in a signed comparison) bypassed the |
| ; S_AND_B32 masking and caused S_LSHR_B64 to shift by 63 instead of 31. |
| ; |
| ; - fshl(scalar, const, 1): a non-zero constant shift Z is negated instead, so |
| ; -Z = -1 is the inverted shift; the same buggy immediate pattern matched it |
| ; and skipped masking. |
| ; |
| ; - fshl(scalar, const, 16): same path computes -16 as the inverted shift; |
| ; -16 also satisfies Imm < 32 signed and would bypass S_AND_B32 masking. |
| ; |
| ; - fshl(scalar, scalar, 8): same lowering applies when both inputs are uniform |
| ; scalars; -8 would bypass S_AND_B32 masking. |
| ; |
| ; Correct behaviour: |
| ; fshl(X, Y, 0) == X |
| ; fshl(X, Y, 1) == (X << 1) | (Y >> 31) |
| ; fshl(X, Y, 16) == (X << 16) | (Y >> 16) |
| ; fshl(X, Y, 8) == (X << 8) | (Y >> 24) |
| |
| ; fshl(scalar, const, 0) must return scalar unchanged. |
| ; |
| ; The inverted shift amount for a shift of 0 is ~0 = -1. The expected output |
| ; below materializes -1 and masks it with s_and_b32 ..., 31 before the final |
| ; s_lshr_b64, so the shift amount actually used is 31. |
| ; |
| ; Parameters: |
| ; %out - global (addrspace 1) pointer the i32 result is stored to. |
| ; %x - i32 input; passed through readfirstlane before the funnel shift. |
define void @fshl_scalar_const_shift0(ptr addrspace(1) %out, i32 %x) { |
| ; CHECK-LABEL: fshl_scalar_const_shift0: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v3, v1 |
| ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; CHECK-NEXT: v_mov_b32_e32 v1, v3 |
| ; CHECK-NEXT: v_readfirstlane_b32 s4, v2 |
| ; CHECK-NEXT: s_mov_b32 s3, 1 |
| ; CHECK-NEXT: s_lshr_b32 s2, s4, s3 |
| ; CHECK-NEXT: s_mov_b32 s0, 0x5040305 |
| ; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 |
| ; CHECK-NEXT: s_mov_b32 s1, s4 |
| ; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 |
| ; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 |
| ; CHECK-NEXT: s_mov_b32 s1, s2 |
| ; CHECK-NEXT: s_mov_b32 s2, -1 |
| ; CHECK-NEXT: s_and_b32 s2, s2, 31 |
| ; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 |
| ; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 |
| ; CHECK-NEXT: global_store_dword v[0:1], v2, off |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; readfirstlane yields a uniform value, so the funnel shift below is emitted |
| ; as scalar (s_*) instructions in the expected output. |
| %scalar = call i32 @llvm.amdgcn.readfirstlane(i32 %x) |
| ; 84148997 == 0x5040305, the constant second operand (see s_mov_b32 above). |
| %result = call i32 @llvm.fshl.i32(i32 %scalar, i32 84148997, i32 0) |
| store i32 %result, ptr addrspace(1) %out, align 4 |
| ret void |
| } |
| |
| ; fshl(scalar, const, 1): inverted shift is 31 (= -1 masked to 31), which is |
| ; correct. The bug was that -1 matched ShiftAmt32Imm (Imm < 32, signed) and |
| ; skipped the S_AND_B32 masking, passing -1 raw to S_LSHR_B64. |
| ; |
| ; Parameters: |
| ; %out - global (addrspace 1) pointer the i32 result is stored to. |
| ; %x - i32 input; passed through readfirstlane before the funnel shift. |
define void @fshl_scalar_const_shift1(ptr addrspace(1) %out, i32 %x) { |
| ; CHECK-LABEL: fshl_scalar_const_shift1: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v3, v1 |
| ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; CHECK-NEXT: v_mov_b32_e32 v1, v3 |
| ; CHECK-NEXT: v_readfirstlane_b32 s2, v2 |
| ; CHECK-NEXT: s_mov_b32 s0, 0xf2f2f2f2 |
| ; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 |
| ; CHECK-NEXT: s_mov_b32 s1, s2 |
| ; CHECK-NEXT: s_mov_b32 s2, -1 |
| ; CHECK-NEXT: s_and_b32 s2, s2, 31 |
| ; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 |
| ; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 |
| ; CHECK-NEXT: global_store_dword v[0:1], v2, off |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; readfirstlane yields a uniform value, so the funnel shift below is emitted |
| ; as scalar (s_*) instructions in the expected output. |
| %scalar = call i32 @llvm.amdgcn.readfirstlane(i32 %x) |
| ; -218959118 == 0xf2f2f2f2, the constant second operand (see s_mov_b32 above). |
| %result = call i32 @llvm.fshl.i32(i32 %scalar, i32 -218959118, i32 1) |
| store i32 %result, ptr addrspace(1) %out, align 4 |
| ret void |
| } |
| |
| ; fshl(scalar, const, 16): inverted shift is -16, which also satisfied |
| ; Imm < 32 signed and bypassed S_AND_B32 masking before the fix. |
| ; With the mask applied (-16 & 31) the shift amount used by s_lshr_b64 is 16. |
| ; |
| ; Parameters: |
| ; %out - global (addrspace 1) pointer the i32 result is stored to. |
| ; %x - i32 input; passed through readfirstlane before the funnel shift. |
define void @fshl_scalar_const_shift16(ptr addrspace(1) %out, i32 %x) { |
| ; CHECK-LABEL: fshl_scalar_const_shift16: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v3, v1 |
| ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; CHECK-NEXT: v_mov_b32_e32 v1, v3 |
| ; CHECK-NEXT: v_readfirstlane_b32 s2, v2 |
| ; CHECK-NEXT: s_mov_b32 s0, 0xf2f2f2f2 |
| ; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 |
| ; CHECK-NEXT: s_mov_b32 s1, s2 |
| ; CHECK-NEXT: s_mov_b32 s2, -16 |
| ; CHECK-NEXT: s_and_b32 s2, s2, 31 |
| ; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 |
| ; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 |
| ; CHECK-NEXT: global_store_dword v[0:1], v2, off |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; readfirstlane yields a uniform value, so the funnel shift below is emitted |
| ; as scalar (s_*) instructions in the expected output. |
| %scalar = call i32 @llvm.amdgcn.readfirstlane(i32 %x) |
| ; -218959118 == 0xf2f2f2f2, the constant second operand (see s_mov_b32 above). |
| %result = call i32 @llvm.fshl.i32(i32 %scalar, i32 -218959118, i32 16) |
| store i32 %result, ptr addrspace(1) %out, align 4 |
| ret void |
| } |
| |
| ; fshl(scalar, scalar, 8): both inputs uniform; inverted shift is -8, which |
| ; also satisfied Imm < 32 signed and bypassed S_AND_B32 masking before the fix. |
| ; With the mask applied (-8 & 31) the shift amount used by s_lshr_b64 is 24. |
| ; |
| ; Parameters: |
| ; %out - global (addrspace 1) pointer the i32 result is stored to. |
| ; %x - i32 first funnel-shift operand; passed through readfirstlane. |
| ; %y - i32 second funnel-shift operand; passed through readfirstlane. |
define void @fshl_scalar_scalar_shift8(ptr addrspace(1) %out, i32 %x, i32 %y) { |
| ; CHECK-LABEL: fshl_scalar_scalar_shift8: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: v_mov_b32_e32 v4, v1 |
| ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| ; CHECK-NEXT: v_mov_b32_e32 v1, v4 |
| ; CHECK-NEXT: v_readfirstlane_b32 s2, v2 |
| ; CHECK-NEXT: v_readfirstlane_b32 s0, v3 |
| ; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 |
| ; CHECK-NEXT: s_mov_b32 s1, s2 |
| ; CHECK-NEXT: s_mov_b32 s2, -8 |
| ; CHECK-NEXT: s_and_b32 s2, s2, 31 |
| ; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 |
| ; CHECK-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s0 |
| ; CHECK-NEXT: global_store_dword v[0:1], v2, off |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| ; Both operands go through readfirstlane, so neither is a compile-time |
| ; constant; only the shift amount (8) is an immediate. |
| %sx = call i32 @llvm.amdgcn.readfirstlane(i32 %x) |
| %sy = call i32 @llvm.amdgcn.readfirstlane(i32 %y) |
| %result = call i32 @llvm.fshl.i32(i32 %sx, i32 %sy, i32 8) |
| store i32 %result, ptr addrspace(1) %out, align 4 |
| ret void |
| } |