| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 |
| ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink | FileCheck %s |
| |
; Itanium-mangled OCML math-library entry points exercised below.
; Suffix encoding: f = float, d = double, Dh = half, i = i32.
declare hidden float @_Z3powff(float, float)
declare hidden double @_Z3powdd(double, double)
declare hidden half @_Z3powDhDh(half, half)

declare hidden float @_Z4powrff(float, float)
declare hidden double @_Z4powrdd(double, double)
declare hidden half @_Z4powrDhDh(half, half)

declare hidden float @_Z4pownfi(float, i32)
declare hidden double @_Z4powndi(double, i32)
declare hidden half @_Z4pownDhi(half, i32)
| |
| ; -------------------------------------------------------------------- |
| ; test pow |
| ; -------------------------------------------------------------------- |
| |
define half @test_pow_fast_f16(half %x, half %y) {
; CHECK-LABEL: test_pow_fast_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, _Z3powDhDh@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powDhDh@rel32@hi+12
; CHECK-NEXT: s_setpc_b64 s[16:17]
  ; fast pow with a non-constant, non-integral y is not expanded inline here;
  ; the CHECK lines show codegen keeps it as a direct tail call to _Z3powDhDh.
  %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
  ret half %pow
}
| |
define float @test_pow_fast_f32(float %x, float %y) {
; CHECK-LABEL: test_pow_fast_f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, _Z3powff@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powff@rel32@hi+12
; CHECK-NEXT: s_setpc_b64 s[16:17]
  ; Same as the f16 case: generic fast pow stays a tail call to _Z3powff.
  %pow = tail call fast float @_Z3powff(float %x, float %y)
  ret float %pow
}
| |
define double @test_pow_fast_f64(double %x, double %y) {
; CHECK-LABEL: test_pow_fast_f64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, _Z3powdd@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powdd@rel32@hi+12
; CHECK-NEXT: s_setpc_b64 s[16:17]
  ; Generic fast pow on f64 also remains a tail call to _Z3powdd.
  %pow = tail call fast double @_Z3powdd(double %x, double %y)
  ret double %pow
}
| |
define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f16__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: v_log_f16_e64 v3, |v0|
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT: v_cvt_f32_f16_e32 v1, v1
; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1
; CHECK-NEXT: v_cvt_f32_i32_e32 v2, v1
; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1
; CHECK-NEXT: v_and_b32_e32 v0, v1, v0
; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2
; CHECK-NEXT: v_exp_f16_e32 v2, v2
; CHECK-NEXT: v_or_b32_e32 v0, v0, v2
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; y is known integral (it comes from sitofp), so the call is expanded inline:
  ; exp2(y * log2(|x|)), with the result sign rebuilt from x's sign bit and
  ; y's parity (lshlrev by 15 / and / or on the f16 sign bit).
  %y = sitofp i32 %y.i to half
  %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
  ret half %pow
}
| |
define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f32__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: s_mov_b32 s4, 0x800000
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1
; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3
; CHECK-NEXT: v_log_f32_e32 v3, v3
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4
; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3
; CHECK-NEXT: v_exp_f32_e32 v2, v2
; CHECK-NEXT: v_not_b32_e32 v3, 63
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; CHECK-NEXT: v_ldexp_f32 v2, v2, v3
; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; Inline expansion as exp2(y * log2(|x|)); the ldexp/cndmask sequences around
  ; log and exp rescale |x| below 0x800000 (smallest normal f32) and large
  ; exponent products, and the final and_or copies in x's sign when y is odd.
  %y = sitofp i32 %y.i to float
  %pow = tail call fast float @_Z3powff(float %x, float %y)
  ret float %pow
}
| |
define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f64__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
; CHECK-NEXT: v_writelane_b32 v43, s30, 0
; CHECK-NEXT: v_writelane_b32 v43, s31, 1
; CHECK-NEXT: v_writelane_b32 v43, s34, 2
; CHECK-NEXT: v_writelane_b32 v43, s35, 3
; CHECK-NEXT: v_writelane_b32 v43, s36, 4
; CHECK-NEXT: v_writelane_b32 v43, s37, 5
; CHECK-NEXT: v_writelane_b32 v43, s38, 6
; CHECK-NEXT: v_writelane_b32 v43, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v43, s48, 8
; CHECK-NEXT: v_writelane_b32 v43, s49, 9
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v43, s50, 10
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v43, s51, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v1
; CHECK-NEXT: v_writelane_b32 v43, s52, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: v_writelane_b32 v43, s53, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v2
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
; CHECK-NEXT: s_mov_b32 s52, s13
; CHECK-NEXT: s_mov_b32 s53, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s53
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s51
; CHECK-NEXT: s_mov_b32 s15, s50
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v41
; CHECK-NEXT: v_and_b32_e32 v2, v2, v42
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
; CHECK-NEXT: v_readlane_b32 s53, v43, 13
; CHECK-NEXT: v_readlane_b32 s52, v43, 12
; CHECK-NEXT: v_readlane_b32 s51, v43, 11
; CHECK-NEXT: v_readlane_b32 s50, v43, 10
; CHECK-NEXT: v_readlane_b32 s49, v43, 9
; CHECK-NEXT: v_readlane_b32 s48, v43, 8
; CHECK-NEXT: v_readlane_b32 s39, v43, 7
; CHECK-NEXT: v_readlane_b32 s38, v43, 6
; CHECK-NEXT: v_readlane_b32 s37, v43, 5
; CHECK-NEXT: v_readlane_b32 s36, v43, 4
; CHECK-NEXT: v_readlane_b32 s35, v43, 3
; CHECK-NEXT: v_readlane_b32 s34, v43, 2
; CHECK-NEXT: v_readlane_b32 s31, v43, 1
; CHECK-NEXT: v_readlane_b32 s30, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; The f64 expansion has no inline log/exp instructions, so it becomes two
  ; GOT-loaded calls: _Z4log2d(|x|) then _Z4exp2d(y * log2(|x|)); the long
  ; writelane/readlane and buffer spill sequences are CSR save/restore around
  ; those calls. The sign bit is re-attached afterwards from x and y's parity
  ; (lshlrev 31 / and / or).
  %y = sitofp i32 %y.i to double
  %pow = tail call fast double @_Z3powdd(double %x, double %y)
  ret double %pow
}
| |
| ; -------------------------------------------------------------------- |
| ; test powr |
| ; -------------------------------------------------------------------- |
| |
define half @test_powr_fast_f16(half %x, half %y) {
; CHECK-LABEL: test_powr_fast_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_log_f16_e32 v0, v0
; CHECK-NEXT: v_mul_f16_e32 v0, v1, v0
; CHECK-NEXT: v_exp_f16_e32 v0, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; powr expands to the bare exp2(y * log2(x)) sequence with no |x| or sign
  ; handling (unlike the pow cases above), since powr's x operand is treated
  ; as non-negative.
  %powr = tail call fast half @_Z4powrDhDh(half %x, half %y)
  ret half %powr
}
| |
define float @test_powr_fast_f32(float %x, float %y) {
; CHECK-LABEL: test_powr_fast_f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0x800000
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
; CHECK-NEXT: v_ldexp_f32 v0, v0, v3
; CHECK-NEXT: v_log_f32_e32 v0, v0
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT: v_sub_f32_e32 v0, v0, v2
; CHECK-NEXT: v_mul_f32_e32 v2, v1, v0
; CHECK-NEXT: v_mov_b32_e32 v3, 0x42800000
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
; CHECK-NEXT: v_fma_f32 v0, v1, v0, v2
; CHECK-NEXT: v_exp_f32_e32 v0, v0
; CHECK-NEXT: v_not_b32_e32 v1, 63
; CHECK-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; CHECK-NEXT: v_ldexp_f32 v0, v0, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; Inline exp2(y * log2(x)): the log input is rescaled when x < 0x800000
  ; (smallest normal f32) and the exp result via ldexp for large exponents;
  ; note x is used directly (v0, not |v0|) and no sign bit is reinserted.
  %powr = tail call fast float @_Z4powrff(float %x, float %y)
  ret float %powr
}
| |
define double @test_powr_fast_f64(double %x, double %y) {
; CHECK-LABEL: test_powr_fast_f64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
; CHECK-NEXT: v_writelane_b32 v43, s30, 0
; CHECK-NEXT: v_writelane_b32 v43, s31, 1
; CHECK-NEXT: v_writelane_b32 v43, s34, 2
; CHECK-NEXT: v_writelane_b32 v43, s35, 3
; CHECK-NEXT: v_writelane_b32 v43, s36, 4
; CHECK-NEXT: v_writelane_b32 v43, s37, 5
; CHECK-NEXT: v_writelane_b32 v43, s38, 6
; CHECK-NEXT: v_writelane_b32 v43, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v43, s48, 8
; CHECK-NEXT: v_writelane_b32 v43, s49, 9
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v43, s50, 10
; CHECK-NEXT: v_writelane_b32 v43, s51, 11
; CHECK-NEXT: v_writelane_b32 v43, s52, 12
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v43, s53, 13
; CHECK-NEXT: v_mov_b32_e32 v42, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v3
; CHECK-NEXT: v_mov_b32_e32 v40, v2
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
; CHECK-NEXT: s_mov_b32 s52, s13
; CHECK-NEXT: s_mov_b32 s53, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mul_f64 v[0:1], v[40:41], v[0:1]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s53
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s51
; CHECK-NEXT: s_mov_b32 s15, s50
; CHECK-NEXT: v_mov_b32_e32 v31, v42
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s53, v43, 13
; CHECK-NEXT: v_readlane_b32 s52, v43, 12
; CHECK-NEXT: v_readlane_b32 s51, v43, 11
; CHECK-NEXT: v_readlane_b32 s50, v43, 10
; CHECK-NEXT: v_readlane_b32 s49, v43, 9
; CHECK-NEXT: v_readlane_b32 s48, v43, 8
; CHECK-NEXT: v_readlane_b32 s39, v43, 7
; CHECK-NEXT: v_readlane_b32 s38, v43, 6
; CHECK-NEXT: v_readlane_b32 s37, v43, 5
; CHECK-NEXT: v_readlane_b32 s36, v43, 4
; CHECK-NEXT: v_readlane_b32 s35, v43, 3
; CHECK-NEXT: v_readlane_b32 s34, v43, 2
; CHECK-NEXT: v_readlane_b32 s31, v43, 1
; CHECK-NEXT: v_readlane_b32 s30, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; f64 powr expands to _Z4exp2d(y * _Z4log2d(x)); unlike the pow/pown f64
  ; expansions there is no fabs mask on x and no sign fix-up after exp2.
  ; The surrounding writelane/buffer traffic is CSR save/restore for the calls.
  %powr = tail call fast double @_Z4powrdd(double %x, double %y)
  ret double %powr
}
| |
| ; -------------------------------------------------------------------- |
| ; test pown |
| ; -------------------------------------------------------------------- |
| |
define half @test_pown_fast_f16(half %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_i32_e32 v2, v1
; CHECK-NEXT: v_log_f16_e64 v3, |v0|
; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1
; CHECK-NEXT: v_and_b32_e32 v0, v1, v0
; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2
; CHECK-NEXT: v_exp_f16_e32 v2, v2
; CHECK-NEXT: v_or_b32_e32 v0, v0, v2
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; pown with integer exponent expands like pow-with-integral-y:
  ; exp2(y * log2(|x|)), sign recombined from x's sign bit masked by y's
  ; parity (y's bit 0 shifted into the f16 sign position).
  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
  ret half %call
}
| |
define float @test_pown_fast_f32(float %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0x800000
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3
; CHECK-NEXT: v_log_f32_e32 v3, v3
; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2
; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3
; CHECK-NEXT: v_exp_f32_e32 v2, v2
; CHECK-NEXT: v_not_b32_e32 v3, 63
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; CHECK-NEXT: v_ldexp_f32 v2, v2, v3
; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; Same inline exp2/log2 expansion as the f32 integral-y pow case, including
  ; the subnormal rescale of |x| and the and_or sign merge keyed on y's parity.
  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
  ret float %call
}
| |
define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
; CHECK-NEXT: v_writelane_b32 v43, s30, 0
; CHECK-NEXT: v_writelane_b32 v43, s31, 1
; CHECK-NEXT: v_writelane_b32 v43, s34, 2
; CHECK-NEXT: v_writelane_b32 v43, s35, 3
; CHECK-NEXT: v_writelane_b32 v43, s36, 4
; CHECK-NEXT: v_writelane_b32 v43, s37, 5
; CHECK-NEXT: v_writelane_b32 v43, s38, 6
; CHECK-NEXT: v_writelane_b32 v43, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v43, s48, 8
; CHECK-NEXT: v_writelane_b32 v43, s49, 9
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v43, s50, 10
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v43, s51, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v1
; CHECK-NEXT: v_writelane_b32 v43, s52, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: v_writelane_b32 v43, s53, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: v_mov_b32_e32 v41, v2
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
; CHECK-NEXT: s_mov_b32 s52, s13
; CHECK-NEXT: s_mov_b32 s53, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s53
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s51
; CHECK-NEXT: s_mov_b32 s15, s50
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v41
; CHECK-NEXT: v_and_b32_e32 v2, v2, v42
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
; CHECK-NEXT: v_readlane_b32 s53, v43, 13
; CHECK-NEXT: v_readlane_b32 s52, v43, 12
; CHECK-NEXT: v_readlane_b32 s51, v43, 11
; CHECK-NEXT: v_readlane_b32 s50, v43, 10
; CHECK-NEXT: v_readlane_b32 s49, v43, 9
; CHECK-NEXT: v_readlane_b32 s48, v43, 8
; CHECK-NEXT: v_readlane_b32 s39, v43, 7
; CHECK-NEXT: v_readlane_b32 s38, v43, 6
; CHECK-NEXT: v_readlane_b32 s37, v43, 5
; CHECK-NEXT: v_readlane_b32 s36, v43, 4
; CHECK-NEXT: v_readlane_b32 s35, v43, 3
; CHECK-NEXT: v_readlane_b32 s34, v43, 2
; CHECK-NEXT: v_readlane_b32 s31, v43, 1
; CHECK-NEXT: v_readlane_b32 s30, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; f64 pown: same two-call expansion as the f64 integral-y pow case —
  ; _Z4log2d(|x|) (|x| via the 0x7fffffff mask), sitofp of y, _Z4exp2d of
  ; the product, then the sign bit merged back from x and y's parity.
  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
  ret double %call
}
| |
define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f16_known_even:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: v_log_f16_e64 v0, |v0|
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT: v_mul_f16_e32 v0, v0, v1
; CHECK-NEXT: v_exp_f16_e32 v0, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; The shl-by-1 makes y provably even, so the result is never negative:
  ; the expansion drops all sign-copy code and is just exp2(y * log2(|x|)).
  %y = shl i32 %y.arg, 1
  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
  ret half %call
}
| |
define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f32_known_even:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0x800000
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
; CHECK-NEXT: v_ldexp_f32 v0, |v0|, v3
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; CHECK-NEXT: v_log_f32_e32 v0, v0
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: v_sub_f32_e32 v0, v0, v2
; CHECK-NEXT: v_mul_f32_e32 v2, v0, v1
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT: v_mov_b32_e32 v3, 0x42800000
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v2
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
; CHECK-NEXT: v_fma_f32 v0, v0, v1, v2
; CHECK-NEXT: v_exp_f32_e32 v0, v0
; CHECK-NEXT: v_not_b32_e32 v1, 63
; CHECK-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; CHECK-NEXT: v_ldexp_f32 v0, v0, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; y is even (shl 1), so no v_and_or/bfi sign merge is emitted — only the
  ; exp2(y * log2(|x|)) sequence with the usual subnormal/overflow rescaling.
  %y = shl i32 %y.arg, 1
  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
  ret float %call
}
| |
define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f64_known_even:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v42, s16, 14
; CHECK-NEXT: v_writelane_b32 v42, s30, 0
; CHECK-NEXT: v_writelane_b32 v42, s31, 1
; CHECK-NEXT: v_writelane_b32 v42, s34, 2
; CHECK-NEXT: v_writelane_b32 v42, s35, 3
; CHECK-NEXT: v_writelane_b32 v42, s36, 4
; CHECK-NEXT: v_writelane_b32 v42, s37, 5
; CHECK-NEXT: v_writelane_b32 v42, s38, 6
; CHECK-NEXT: v_writelane_b32 v42, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v42, s48, 8
; CHECK-NEXT: v_writelane_b32 v42, s49, 9
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v42, s50, 10
; CHECK-NEXT: v_writelane_b32 v42, s51, 11
; CHECK-NEXT: v_writelane_b32 v42, s52, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v42, s53, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
; CHECK-NEXT: s_mov_b32 s52, s13
; CHECK-NEXT: s_mov_b32 s53, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_lshlrev_b32_e32 v41, 1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s53
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s51
; CHECK-NEXT: s_mov_b32 s15, s50
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s53, v42, 13
; CHECK-NEXT: v_readlane_b32 s52, v42, 12
; CHECK-NEXT: v_readlane_b32 s51, v42, 11
; CHECK-NEXT: v_readlane_b32 s50, v42, 10
; CHECK-NEXT: v_readlane_b32 s49, v42, 9
; CHECK-NEXT: v_readlane_b32 s48, v42, 8
; CHECK-NEXT: v_readlane_b32 s39, v42, 7
; CHECK-NEXT: v_readlane_b32 s38, v42, 6
; CHECK-NEXT: v_readlane_b32 s37, v42, 5
; CHECK-NEXT: v_readlane_b32 s36, v42, 4
; CHECK-NEXT: v_readlane_b32 s35, v42, 3
; CHECK-NEXT: v_readlane_b32 s34, v42, 2
; CHECK-NEXT: v_readlane_b32 s31, v42, 1
; CHECK-NEXT: v_readlane_b32 s30, v42, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v42, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; f64 two-call expansion (_Z4log2d then _Z4exp2d) with y known even:
  ; no sign merge after exp2 and one fewer VGPR saved (v42 vs v43 above),
  ; since x's sign word no longer needs to live across both calls.
  %y = shl i32 %y.arg, 1
  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
  ret double %call
}
| |
define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f16_known_odd:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_or_b32_e32 v1, 1, v1
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: v_log_f16_e64 v2, |v0|
; CHECK-NEXT: s_movk_i32 s4, 0x7fff
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT: v_mul_f16_e32 v1, v2, v1
; CHECK-NEXT: v_exp_f16_e32 v1, v1
; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; y is odd (or 1), so the result always takes x's sign: the parity test
  ; collapses to a copysign, done with v_bfi_b32 under the 0x7fff mantissa+
  ; exponent mask.
  %y = or i32 %y.arg, 1
  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
  ret half %call
}
| |
define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f32_known_odd:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0x800000
; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3
; CHECK-NEXT: v_or_b32_e32 v1, 1, v1
; CHECK-NEXT: v_log_f32_e32 v3, v3
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2
; CHECK-NEXT: v_mul_f32_e32 v3, v2, v1
; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT: v_mov_b32_e32 v4, 0x42800000
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; CHECK-NEXT: v_fma_f32 v1, v2, v1, v3
; CHECK-NEXT: v_exp_f32_e32 v1, v1
; CHECK-NEXT: v_not_b32_e32 v2, 63
; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT: s_brev_b32 s4, -2
; CHECK-NEXT: v_ldexp_f32 v1, v1, v2
; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; y odd ⇒ plain copysign from x: v_bfi_b32 with mask 0x7fffffff
  ; (s_brev_b32 of -2) instead of the parity-driven and_or of the generic case.
  %y = or i32 %y.arg, 1
  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
  ret float %call
}
| |
define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f64_known_odd:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v43, s16, 14
; CHECK-NEXT: v_writelane_b32 v43, s30, 0
; CHECK-NEXT: v_writelane_b32 v43, s31, 1
; CHECK-NEXT: v_writelane_b32 v43, s34, 2
; CHECK-NEXT: v_writelane_b32 v43, s35, 3
; CHECK-NEXT: v_writelane_b32 v43, s36, 4
; CHECK-NEXT: v_writelane_b32 v43, s37, 5
; CHECK-NEXT: v_writelane_b32 v43, s38, 6
; CHECK-NEXT: v_writelane_b32 v43, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v43, s48, 8
; CHECK-NEXT: v_writelane_b32 v43, s49, 9
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v43, s50, 10
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v43, s51, 11
; CHECK-NEXT: v_mov_b32_e32 v41, v1
; CHECK-NEXT: v_writelane_b32 v43, s52, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v41
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: v_writelane_b32 v43, s53, 13
; CHECK-NEXT: v_mov_b32_e32 v40, v31
; CHECK-NEXT: s_mov_b32 s50, s15
; CHECK-NEXT: s_mov_b32 s51, s14
; CHECK-NEXT: s_mov_b32 s52, s13
; CHECK-NEXT: s_mov_b32 s53, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_or_b32_e32 v42, 1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s53
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s51
; CHECK-NEXT: s_mov_b32 s15, s50
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_and_b32_e32 v2, 0x80000000, v41
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
; CHECK-NEXT: v_readlane_b32 s53, v43, 13
; CHECK-NEXT: v_readlane_b32 s52, v43, 12
; CHECK-NEXT: v_readlane_b32 s51, v43, 11
; CHECK-NEXT: v_readlane_b32 s50, v43, 10
; CHECK-NEXT: v_readlane_b32 s49, v43, 9
; CHECK-NEXT: v_readlane_b32 s48, v43, 8
; CHECK-NEXT: v_readlane_b32 s39, v43, 7
; CHECK-NEXT: v_readlane_b32 s38, v43, 6
; CHECK-NEXT: v_readlane_b32 s37, v43, 5
; CHECK-NEXT: v_readlane_b32 s36, v43, 4
; CHECK-NEXT: v_readlane_b32 s35, v43, 3
; CHECK-NEXT: v_readlane_b32 s34, v43, 2
; CHECK-NEXT: v_readlane_b32 s31, v43, 1
; CHECK-NEXT: v_readlane_b32 s30, v43, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  ; f64 two-call expansion with y known odd: the post-exp2 fix-up is a pure
  ; copysign — x's sign bit extracted with 0x80000000 and OR'ed into the
  ; result's high word, with no parity shift needed.
  %y = or i32 %y.arg, 1
  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
  ret double %call
}
| |
; Pin the AMDHSA code object ABI to v5 so the generated kernel descriptor /
; implicit-argument layout (and thus the CHECK lines above) is stable.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}