| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1251,GFX1251-SDAG %s |
| ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1251,GFX1251-GISEL %s |
| |
| ; fadd_v2_vv: load a <2 x double> from %a[workitem.id.x], add it to itself, |
| ; and store the sum back to the same address. |
| ; Both operands of the fadd are the same loaded value, so the checks expect a |
| ; single v_pk_add_f64 whose two sources are the same VGPR quad. SelectionDAG and |
| ; GlobalISel share the common GFX1251 prefix here. |
| define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { |
| ; GFX1251-LABEL: fadd_v2_vv: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: v_and_b32_e32 v4, 0x3ff, v0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset |
| ; GFX1251-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[0:3] |
| ; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset |
| ; GFX1251-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %add = fadd <2 x double> %load, %load |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; fadd_v2_vs: load a <2 x double> from %a[workitem.id.x], add the kernel |
| ; argument %x to it, and store the sum back to the same address. |
| ; The checks expect %x to be loaded into SGPRs (s_load_b128) and then copied |
| ; into VGPRs with two v_mov_b64 before a single v_pk_add_f64 consumes it. |
| ; SelectionDAG and GlobalISel share the common GFX1251 prefix here. |
| define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x double> %x) { |
| ; GFX1251-LABEL: fadd_v2_vs: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_clause 0x1 |
| ; GFX1251-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv |
| ; GFX1251-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset |
| ; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset |
| ; GFX1251-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %add = fadd <2 x double> %load, %x |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; fadd_v2_ss: add the two <2 x double> kernel arguments %x and %y and store the |
| ; sum directly to %a (no workitem-id indexing, no vector load). |
| ; Both operands arrive in SGPRs via one s_load_b256; the checks expect both to |
| ; be copied into VGPRs before a single v_pk_add_f64. The two selectors differ |
| ; only in how they copy: SelectionDAG uses v_dual_mov_b32 pairs, GlobalISel |
| ; uses v_mov_b64. |
| define amdgpu_kernel void @fadd_v2_ss(ptr addrspace(1) %a, <2 x double> %x, <2 x double> %y) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_ss: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_clause 0x1 |
| ; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_ss: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %add = fadd <2 x double> %x, %y |
| store <2 x double> %add, ptr addrspace(1) %a, align 8 |
| ret void |
| } |
| |
| ; fadd_v4_vs: load a <4 x double> from %a[workitem.id.x], add the kernel |
| ; argument %x to it, and store the sum back to the same address. |
| ; The checks expect the 256-bit vector to be handled as two 128-bit halves: two |
| ; global_load_b128, two v_pk_add_f64, and two global_store_b128. The address is |
| ; formed with an explicit shift of the workitem id by 5 (32 bytes per element) |
| ; rather than the scale_offset modifier seen in the <2 x double> tests. |
| define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x double> %x) { |
| ; GFX1251-SDAG-LABEL: fadd_v4_vs: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_clause 0x1 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_lshlrev_b32 v16, 5, v0 :: v_dual_mov_b32 v8, s12 |
| ; GFX1251-SDAG-NEXT: s_clause 0x1 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v16, s[0:1] |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v13, s9 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v15, s11 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v10, s14 |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v11, s15 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[12:15] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[4:7], v[4:7], v[8:11] |
| ; GFX1251-SDAG-NEXT: s_clause 0x1 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v16, v[0:3], s[0:1] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v4_vs: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 nv |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_lshlrev_b32_e32 v16, 5, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v16, s[0:1] |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[8:11] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[4:7], v[4:7], v[12:15] |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v16, v[0:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <4 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <4 x double>, ptr addrspace(1) %gep, align 16 |
| %add = fadd <4 x double> %load, %x |
| store <4 x double> %add, ptr addrspace(1) %gep, align 16 |
| ret void |
| } |
| |
| ; fadd_v32_vs: load a <32 x double> from %a[workitem.id.x], add the kernel |
| ; argument %x to it, and store the sum back to the same address. |
| ; The checks expect the vector to be split into sixteen 128-bit pieces: sixteen |
| ; global_load_b128, sixteen v_pk_add_f64, and sixteen global_store_b128, with |
| ; the pieces of %x copied from SGPRs into two reused VGPR quads between adds. |
| ; The address is formed with an explicit shift of the workitem id by 8 (256 |
| ; bytes per element). SelectionDAG loads all of %x up front in one clause, |
| ; while GlobalISel issues two of its s_load_b512 later, interleaved with the adds. |
| define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x double> %x) { |
| ; GFX1251-SDAG-LABEL: fadd_v32_vs: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_clause 0x4 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[68:83], s[4:5], 0x1a4 nv |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[52:67], s[4:5], 0x1e4 nv |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0x124 nv |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0x164 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_lshlrev_b32 v44, 8, v0 :: v_dual_mov_b32 v66, s72 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v67, s73 :: v_dual_mov_b32 v68, s74 |
| ; GFX1251-SDAG-NEXT: s_clause 0xf |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[40:43], v44, s[0:1] offset:144 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[36:39], v44, s[0:1] offset:128 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[32:35], v44, s[0:1] offset:176 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[28:31], v44, s[0:1] offset:160 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[24:27], v44, s[0:1] offset:208 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[20:23], v44, s[0:1] offset:192 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[16:19], v44, s[0:1] offset:240 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[12:15], v44, s[0:1] offset:224 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[8:11], v44, s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v44, s[0:1] |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v44, s[0:1] offset:80 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[46:49], v44, s[0:1] offset:64 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[50:53], v44, s[0:1] offset:112 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[54:57], v44, s[0:1] offset:96 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[58:61], v44, s[0:1] offset:32 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[62:65], v44, s[0:1] offset:48 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v69, s75 :: v_dual_mov_b32 v70, s68 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v71, s69 :: v_dual_mov_b32 v72, s70 |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v73, s71 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xf |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[40:43], v[40:43], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s80 :: v_dual_mov_b32 v67, s81 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s82 :: v_dual_mov_b32 v69, s83 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xe |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[36:39], v[36:39], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s76 :: v_dual_mov_b32 v71, s77 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s78 :: v_dual_mov_b32 v73, s79 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xd |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[32:35], v[32:35], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s56 :: v_dual_mov_b32 v67, s57 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s58 :: v_dual_mov_b32 v69, s59 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xc |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[28:31], v[28:31], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s52 :: v_dual_mov_b32 v71, s53 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s54 :: v_dual_mov_b32 v73, s55 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xb |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[24:27], v[24:27], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s64 :: v_dual_mov_b32 v67, s65 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s66 :: v_dual_mov_b32 v69, s67 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[20:23], v[20:23], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s60 :: v_dual_mov_b32 v71, s61 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s62 :: v_dual_mov_b32 v73, s63 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x9 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[16:19], v[16:19], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s12 :: v_dual_mov_b32 v67, s13 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s14 :: v_dual_mov_b32 v69, s15 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[12:15], v[12:15], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s9 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s10 :: v_dual_mov_b32 v73, s11 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x7 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[8:11], v[8:11], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s40 :: v_dual_mov_b32 v67, s41 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s42 :: v_dual_mov_b32 v69, s43 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x6 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[4:7], v[4:7], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s36 :: v_dual_mov_b32 v71, s37 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s38 :: v_dual_mov_b32 v73, s39 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x5 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s48 :: v_dual_mov_b32 v67, s49 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s50 :: v_dual_mov_b32 v69, s51 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[46:49], v[46:49], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s44 :: v_dual_mov_b32 v71, s45 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s46 :: v_dual_mov_b32 v73, s47 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x3 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[50:53], v[50:53], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s16 :: v_dual_mov_b32 v67, s17 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s18 :: v_dual_mov_b32 v69, s19 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[54:57], v[54:57], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s20 :: v_dual_mov_b32 v71, s21 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s22 :: v_dual_mov_b32 v73, s23 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[58:61], v[58:61], v[66:69] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[62:65], v[62:65], v[70:73] |
| ; GFX1251-SDAG-NEXT: s_clause 0xf |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[54:57], s[0:1] offset:96 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[50:53], s[0:1] offset:112 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[46:49], s[0:1] offset:64 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[0:3], s[0:1] offset:80 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[58:61], s[0:1] offset:32 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[62:65], s[0:1] offset:48 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[4:7], s[0:1] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[8:11], s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[12:15], s[0:1] offset:224 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[16:19], s[0:1] offset:240 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[20:23], s[0:1] offset:192 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[24:27], s[0:1] offset:208 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[28:31], s[0:1] offset:160 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[32:35], s[0:1] offset:176 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[36:39], s[0:1] offset:128 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[40:43], s[0:1] offset:144 |
| ; GFX1251-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v32_vs: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0x124 nv |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[44:59], s[4:5], 0x164 nv |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_lshlrev_b32_e32 v72, 8, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_clause 0xf |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v72, s[0:1] |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v72, s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[8:11], v72, s[0:1] offset:32 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[12:15], v72, s[0:1] offset:48 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[16:19], v72, s[0:1] offset:64 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[20:23], v72, s[0:1] offset:80 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[24:27], v72, s[0:1] offset:96 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[28:31], v72, s[0:1] offset:112 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[32:35], v72, s[0:1] offset:128 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[36:39], v72, s[0:1] offset:144 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[40:43], v72, s[0:1] offset:160 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[44:47], v72, s[0:1] offset:176 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[48:51], v72, s[0:1] offset:192 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[52:55], v72, s[0:1] offset:208 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[56:59], v72, s[0:1] offset:224 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[60:63], v72, s[0:1] offset:240 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[18:19] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[22:23] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[16:17] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[20:21] |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0x1a4 nv |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xf |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[26:27] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xe |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[4:7], v[4:7], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[30:31] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[24:25] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[28:29] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xd |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[8:11], v[8:11], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[46:47] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xc |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[12:15], v[12:15], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[50:51] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[44:45] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[48:49] |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0x1e4 nv |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xb |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[16:19], v[16:19], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[54:55] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[20:23], v[20:23], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[58:59] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[52:53] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[56:57] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x9 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[24:27], v[24:27], v[64:67] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[10:11] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[28:31], v[28:31], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[14:15] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[8:9] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[12:13] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x7 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[32:35], v[32:35], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[18:19] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x6 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[36:39], v[36:39], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[22:23] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[16:17] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[20:21] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x5 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[40:43], v[40:43], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[38:39] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[44:47], v[44:47], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[42:43] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[36:37] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[40:41] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x3 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[48:51], v[48:51], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[46:47] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[52:55], v[52:55], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[50:51] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[44:45] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[48:49] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[56:59], v[56:59], v[64:67] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[60:63], v[60:63], v[68:71] |
| ; GFX1251-GISEL-NEXT: s_clause 0xf |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[0:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[4:7], s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[8:11], s[0:1] offset:32 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[12:15], s[0:1] offset:48 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[16:19], s[0:1] offset:64 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[20:23], s[0:1] offset:80 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[24:27], s[0:1] offset:96 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[28:31], s[0:1] offset:112 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[32:35], s[0:1] offset:128 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[36:39], s[0:1] offset:144 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[40:43], s[0:1] offset:160 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[44:47], s[0:1] offset:176 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[48:51], s[0:1] offset:192 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[52:55], s[0:1] offset:208 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[56:59], s[0:1] offset:224 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[60:63], s[0:1] offset:240 |
| ; GFX1251-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <32 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <32 x double>, ptr addrspace(1) %gep, align 128 |
| %add = fadd <32 x double> %load, %x |
| store <32 x double> %add, ptr addrspace(1) %gep, align 128 |
| ret void |
| } |
| |
| ; fadd_v2_v_imm: load a <2 x double> from %a[workitem.id.x], add the constant |
| ; splat <100.0, 100.0> to it, and store the sum back to the same address. |
| ; The checks expect the constant to be materialized in registers and passed to |
| ; v_pk_add_f64 as a VGPR quad, not as a literal operand. SelectionDAG builds it |
| ; from the dwords 0 and 0x40590000; GlobalISel uses s_mov_b64 with |
| ; 0x4059000000000000 and then copies the SGPR pairs into VGPRs. |
| define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_v_imm: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40590000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_v_imm: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 0x4059000000000000 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %add = fadd <2 x double> %load, <double 100.0, double 100.0> |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; fadd_v2_v_v_splat: load a <2 x double> from %a[workitem.id.x] and add a |
| ; non-constant splat to it. The splat element is the workitem id zero-extended |
| ; to i64 and bitcast to double, inserted into both lanes of %k. |
| ; The checks expect the splat to be built in v[0:3] (low dword = id, high dword |
| ; = 0, then copied to the second lane) and consumed by one v_pk_add_f64. |
| define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_v_v_splat: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, v0 |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[2:5], v[4:7], v[0:3] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_v_v_splat: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[0:1] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[2:5], v[4:7], v[0:3] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %id.1 = zext i32 %id to i64 |
| %fid = bitcast i64 %id.1 to double |
| %tmp1 = insertelement <2 x double> poison, double %fid, i64 0 |
| %k = insertelement <2 x double> %tmp1, double %fid, i64 1 |
| %add = fadd <2 x double> %load, %k |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; TODO: splat literal can be folded, but it is a REG_SEQUENCE which we do not match |
| |
| ; fadd_v2_v_lit_splat: load a <2 x double> from %a[workitem.id.x], add the |
| ; constant splat <1.0, 1.0> to it, and store the sum back to the same address. |
| ; The checks record the current, unfolded output: the splat is materialized in |
| ; registers (SelectionDAG from dwords 0 and 0x3ff00000, GlobalISel from |
| ; s_mov_b64 1.0) and passed to v_pk_add_f64 as a VGPR quad. |
| define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_v_lit_splat: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x3ff00000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_v_lit_splat: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 1.0 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %add = fadd <2 x double> %load, <double 1.0, double 1.0> |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; fadd_v2_v_lit_hi0: load a <2 x double> from %a[workitem.id.x], add the |
| ; non-splat constant <1.0, 0.0> (element 0 is 1.0, element 1 is zero), and |
| ; store the sum back to the same address. |
| ; The checks expect the constant to be materialized in registers with 1.0 in |
| ; the first 64-bit lane (v[4:5]) and zero in the second (v[6:7]), then consumed |
| ; by one v_pk_add_f64. |
| define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_v_lit_hi0: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x3ff00000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v4 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_v_lit_hi0: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], 0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 1.0 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %add = fadd <2 x double> %load, <double 1.0, double 0.0> |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Adds the constant <0.0, 1.0> (element 0 is zero) to a <2 x double> that is |
| ; loaded from and stored back to %a at index workitem.id.x. In the checked |
| ; output both selectors build the constant in v[4:7] and pass it to |
| ; v_pk_add_f64 as a register operand. |
| define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_v_lit_lo0: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0x3ff00000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_v_lit_lo0: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], 1.0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %add = fadd <2 x double> %load, <double 0.0, double 1.0> |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Adds the constant <1.0, 2.0>, whose two elements differ, to a <2 x double> |
| ; that is loaded from and stored back to %a at index workitem.id.x. In the |
| ; checked output both selectors build the constant in v[4:7] and pass it to |
| ; v_pk_add_f64 as a register operand. |
| define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_v_unfoldable_lit: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x3ff00000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v7, 2.0 :: v_dual_mov_b32 v6, v4 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_v_unfoldable_lit: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], 2.0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 1.0 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %add = fadd <2 x double> %load, <double 1.0, double 2.0> |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; TODO: fneg can be folded |
| |
| ; Adds a splat of the negated kernel argument %x (negation written as |
| ; fsub -0.0, %x) to a <2 x double> loaded at index workitem.id.x. In the |
| ; checked output the negation is a separate instruction: SelectionDAG xors the |
| ; high dword s3 with 0x80000000, GlobalISel emits v_max_num_f64 with negated |
| ; sources. The v_pk_add_f64 itself takes plain register operands. |
| define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, double %x) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_v_fneg: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_v_fneg: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: v_max_num_f64_e64 v[4:5], -s[2:3], -s[2:3] |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], v[4:5] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %fneg = fsub double -0.0, %x |
| %tmp1 = insertelement <2 x double> poison, double %fneg, i64 0 |
| %k = insertelement <2 x double> %tmp1, double %fneg, i64 1 |
| %add = fadd <2 x double> %load, %k |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Second fadd operand is <-%x, %x>: only element 0 is negated (negation written |
| ; as fsub -0.0, %x). In the checked output the negated value ends up in v[4:5] |
| ; and the unmodified %x in v[6:7]; v_pk_add_f64 takes plain register operands. |
| define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, double %x) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_v_fneg_lo: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_xor_b32 s4, s3, 0x80000000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_v_fneg_lo: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_max_num_f64_e64 v[4:5], -s[2:3], -s[2:3] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %fneg = fsub double -0.0, %x |
| %tmp1 = insertelement <2 x double> poison, double %fneg, i64 0 |
| %k = insertelement <2 x double> %tmp1, double %x, i64 1 |
| %add = fadd <2 x double> %load, %k |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Second fadd operand is <%x, -%x>: only element 1 is negated (negation written |
| ; as fsub -0.0, %x). In the checked output the unmodified %x ends up in v[4:5] |
| ; and the negated value in v[6:7]; v_pk_add_f64 takes plain register operands. |
| define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, double %x) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_v_fneg_hi: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_xor_b32 s4, s3, 0x80000000 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s4 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_v_fneg_hi: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_max_num_f64_e64 v[6:7], -s[2:3], -s[2:3] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %fneg = fsub double -0.0, %x |
| %tmp1 = insertelement <2 x double> poison, double %x, i64 0 |
| %k = insertelement <2 x double> %tmp1, double %fneg, i64 1 |
| %add = fadd <2 x double> %load, %k |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Second fadd operand is <-%x, %y>: element 0 is the negated argument %x and |
| ; element 1 is a different, unnegated argument %y. In the checked output the |
| ; negated %x ends up in v[4:5] and %y in v[6:7]; v_pk_add_f64 takes plain |
| ; register operands. |
| define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, double %x, double %y) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_v_fneg_lo2: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_clause 0x1 |
| ; GFX1251-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_v_fneg_lo2: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: v_max_num_f64_e64 v[4:5], -s[2:3], -s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %fneg = fsub double -0.0, %x |
| %tmp1 = insertelement <2 x double> poison, double %fneg, i64 0 |
| %k = insertelement <2 x double> %tmp1, double %y, i64 1 |
| %add = fadd <2 x double> %load, %k |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Second fadd operand is <%y, -%x>: element 0 is the unnegated argument %y and |
| ; element 1 is the negated argument %x. In the checked output %y ends up in |
| ; v[4:5] and the negated %x in v[6:7]; v_pk_add_f64 takes plain register |
| ; operands. |
| define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, double %x, double %y) { |
| ; GFX1251-SDAG-LABEL: fadd_v2_v_fneg_hi2: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_clause 0x1 |
| ; GFX1251-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fadd_v2_v_fneg_hi2: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: v_max_num_f64_e64 v[6:7], -s[2:3], -s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_pk_add_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %fneg = fsub double -0.0, %x |
| %tmp1 = insertelement <2 x double> poison, double %y, i64 0 |
| %k = insertelement <2 x double> %tmp1, double %fneg, i64 1 |
| %add = fadd <2 x double> %load, %k |
| store <2 x double> %add, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Multiplies a <2 x double> loaded at index workitem.id.x by itself and stores |
| ; the product back to the same address. Both RUN lines share one set of checks: |
| ; a single v_pk_mul_f64 whose two sources are the same register tuple v[0:3]. |
| define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) { |
| ; GFX1251-LABEL: fmul_v2_vv: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: v_and_b32_e32 v4, 0x3ff, v0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset |
| ; GFX1251-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[0:3] |
| ; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset |
| ; GFX1251-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %mul = fmul <2 x double> %load, %load |
| store <2 x double> %mul, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Multiplies a <2 x double> loaded at index workitem.id.x by the kernel |
| ; argument %x. Both RUN lines share one set of checks: the argument is loaded |
| ; into s[0:3], copied to v[4:7], and used as the second v_pk_mul_f64 source. |
| define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x double> %x) { |
| ; GFX1251-LABEL: fmul_v2_vs: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_clause 0x1 |
| ; GFX1251-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv |
| ; GFX1251-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset |
| ; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset |
| ; GFX1251-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %mul = fmul <2 x double> %load, %x |
| store <2 x double> %mul, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Multiplies two <2 x double> kernel arguments and stores the product directly |
| ; to %a; no workitem id or memory load is involved. In the checked output both |
| ; arguments are copied from s[8:15] into v[0:7] before v_pk_mul_f64, and the |
| ; store uses a zero VGPR offset. |
| define amdgpu_kernel void @fmul_v2_ss(ptr addrspace(1) %a, <2 x double> %x, <2 x double> %y) { |
| ; GFX1251-SDAG-LABEL: fmul_v2_ss: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_clause 0x1 |
| ; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fmul_v2_ss: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %mul = fmul <2 x double> %x, %y |
| store <2 x double> %mul, ptr addrspace(1) %a, align 8 |
| ret void |
| } |
| |
| ; Multiplies a <4 x double> loaded at index workitem.id.x by the kernel |
| ; argument %x. In the checked output the vector is handled as two 128-bit |
| ; halves (offsets 0 and 16): two loads, two v_pk_mul_f64 and two stores, with |
| ; the argument copied from s[8:15] into VGPRs first. |
| define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x double> %x) { |
| ; GFX1251-SDAG-LABEL: fmul_v4_vs: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_clause 0x1 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_lshlrev_b32 v16, 5, v0 :: v_dual_mov_b32 v8, s12 |
| ; GFX1251-SDAG-NEXT: s_clause 0x1 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v16, s[0:1] |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v13, s9 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v15, s11 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v10, s14 |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v11, s15 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[12:15] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[4:7], v[4:7], v[8:11] |
| ; GFX1251-SDAG-NEXT: s_clause 0x1 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v16, v[0:3], s[0:1] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fmul_v4_vs: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 nv |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_lshlrev_b32_e32 v16, 5, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v16, s[0:1] |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[8:11] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[4:7], v[4:7], v[12:15] |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v16, v[0:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <4 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <4 x double>, ptr addrspace(1) %gep, align 16 |
| %mul = fmul <4 x double> %load, %x |
| store <4 x double> %mul, ptr addrspace(1) %gep, align 16 |
| ret void |
| } |
| |
| define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x double> %x) { |
| ; GFX1251-SDAG-LABEL: fmul_v32_vs: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_clause 0x4 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[68:83], s[4:5], 0x1a4 nv |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[52:67], s[4:5], 0x1e4 nv |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0x124 nv |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0x164 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_lshlrev_b32 v44, 8, v0 :: v_dual_mov_b32 v66, s72 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v67, s73 :: v_dual_mov_b32 v68, s74 |
| ; GFX1251-SDAG-NEXT: s_clause 0xf |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[40:43], v44, s[0:1] offset:144 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[36:39], v44, s[0:1] offset:128 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[32:35], v44, s[0:1] offset:176 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[28:31], v44, s[0:1] offset:160 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[24:27], v44, s[0:1] offset:208 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[20:23], v44, s[0:1] offset:192 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[16:19], v44, s[0:1] offset:240 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[12:15], v44, s[0:1] offset:224 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[8:11], v44, s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v44, s[0:1] |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v44, s[0:1] offset:80 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[46:49], v44, s[0:1] offset:64 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[50:53], v44, s[0:1] offset:112 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[54:57], v44, s[0:1] offset:96 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[58:61], v44, s[0:1] offset:32 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[62:65], v44, s[0:1] offset:48 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v69, s75 :: v_dual_mov_b32 v70, s68 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v71, s69 :: v_dual_mov_b32 v72, s70 |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v73, s71 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xf |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[40:43], v[40:43], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s80 :: v_dual_mov_b32 v67, s81 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s82 :: v_dual_mov_b32 v69, s83 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xe |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[36:39], v[36:39], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s76 :: v_dual_mov_b32 v71, s77 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s78 :: v_dual_mov_b32 v73, s79 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xd |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[32:35], v[32:35], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s56 :: v_dual_mov_b32 v67, s57 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s58 :: v_dual_mov_b32 v69, s59 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xc |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[28:31], v[28:31], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s52 :: v_dual_mov_b32 v71, s53 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s54 :: v_dual_mov_b32 v73, s55 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xb |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[24:27], v[24:27], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s64 :: v_dual_mov_b32 v67, s65 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s66 :: v_dual_mov_b32 v69, s67 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[20:23], v[20:23], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s60 :: v_dual_mov_b32 v71, s61 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s62 :: v_dual_mov_b32 v73, s63 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x9 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[16:19], v[16:19], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s12 :: v_dual_mov_b32 v67, s13 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s14 :: v_dual_mov_b32 v69, s15 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[12:15], v[12:15], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s9 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s10 :: v_dual_mov_b32 v73, s11 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x7 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[8:11], v[8:11], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s40 :: v_dual_mov_b32 v67, s41 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s42 :: v_dual_mov_b32 v69, s43 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x6 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[4:7], v[4:7], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s36 :: v_dual_mov_b32 v71, s37 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s38 :: v_dual_mov_b32 v73, s39 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x5 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s48 :: v_dual_mov_b32 v67, s49 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s50 :: v_dual_mov_b32 v69, s51 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[46:49], v[46:49], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s44 :: v_dual_mov_b32 v71, s45 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s46 :: v_dual_mov_b32 v73, s47 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x3 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[50:53], v[50:53], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v66, s16 :: v_dual_mov_b32 v67, s17 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v68, s18 :: v_dual_mov_b32 v69, s19 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[54:57], v[54:57], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v70, s20 :: v_dual_mov_b32 v71, s21 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v72, s22 :: v_dual_mov_b32 v73, s23 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[58:61], v[58:61], v[66:69] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[62:65], v[62:65], v[70:73] |
| ; GFX1251-SDAG-NEXT: s_clause 0xf |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[54:57], s[0:1] offset:96 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[50:53], s[0:1] offset:112 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[46:49], s[0:1] offset:64 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[0:3], s[0:1] offset:80 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[58:61], s[0:1] offset:32 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[62:65], s[0:1] offset:48 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[4:7], s[0:1] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[8:11], s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[12:15], s[0:1] offset:224 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[16:19], s[0:1] offset:240 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[20:23], s[0:1] offset:192 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[24:27], s[0:1] offset:208 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[28:31], s[0:1] offset:160 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[32:35], s[0:1] offset:176 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[36:39], s[0:1] offset:128 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[40:43], s[0:1] offset:144 |
| ; GFX1251-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fmul_v32_vs: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0x124 nv |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[44:59], s[4:5], 0x164 nv |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_lshlrev_b32_e32 v72, 8, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_clause 0xf |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v72, s[0:1] |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v72, s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[8:11], v72, s[0:1] offset:32 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[12:15], v72, s[0:1] offset:48 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[16:19], v72, s[0:1] offset:64 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[20:23], v72, s[0:1] offset:80 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[24:27], v72, s[0:1] offset:96 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[28:31], v72, s[0:1] offset:112 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[32:35], v72, s[0:1] offset:128 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[36:39], v72, s[0:1] offset:144 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[40:43], v72, s[0:1] offset:160 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[44:47], v72, s[0:1] offset:176 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[48:51], v72, s[0:1] offset:192 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[52:55], v72, s[0:1] offset:208 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[56:59], v72, s[0:1] offset:224 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[60:63], v72, s[0:1] offset:240 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[18:19] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[22:23] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[16:17] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[20:21] |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0x1a4 nv |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xf |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[26:27] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xe |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[4:7], v[4:7], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[30:31] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[24:25] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[28:29] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xd |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[8:11], v[8:11], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[46:47] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xc |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[12:15], v[12:15], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[50:51] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[44:45] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[48:49] |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0x1e4 nv |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xb |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[16:19], v[16:19], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[54:55] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[20:23], v[20:23], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[58:59] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[52:53] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[56:57] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x9 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[24:27], v[24:27], v[64:67] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[10:11] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[28:31], v[28:31], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[14:15] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[8:9] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[12:13] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x7 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[32:35], v[32:35], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[18:19] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x6 |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[36:39], v[36:39], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[22:23] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[16:17] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[20:21] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x5 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[40:43], v[40:43], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[38:39] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[44:47], v[44:47], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[42:43] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[36:37] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[40:41] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x3 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[48:51], v[48:51], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[46:47] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[52:55], v[52:55], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[50:51] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[44:45] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[48:49] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[56:59], v[56:59], v[64:67] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[60:63], v[60:63], v[68:71] |
| ; GFX1251-GISEL-NEXT: s_clause 0xf |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[0:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[4:7], s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[8:11], s[0:1] offset:32 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[12:15], s[0:1] offset:48 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[16:19], s[0:1] offset:64 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[20:23], s[0:1] offset:80 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[24:27], s[0:1] offset:96 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[28:31], s[0:1] offset:112 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[32:35], s[0:1] offset:128 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[36:39], s[0:1] offset:144 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[40:43], s[0:1] offset:160 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[44:47], s[0:1] offset:176 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[48:51], s[0:1] offset:192 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[52:55], s[0:1] offset:208 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[56:59], s[0:1] offset:224 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[60:63], s[0:1] offset:240 |
| ; GFX1251-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <32 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <32 x double>, ptr addrspace(1) %gep, align 128 |
| %mul = fmul <32 x double> %load, %x |
| store <32 x double> %mul, ptr addrspace(1) %gep, align 128 |
| ret void |
| } |
| |
| ; Multiplies a <2 x double> loaded at index workitem.id.x by the constant |
| ; splat <100.0, 100.0> and stores the product back to the same address. |
| ; The expected output for both selectors places the constant in v[4:7] |
| ; (high dword 0x40590000, low dword 0) and issues a single v_pk_mul_f64. |
| define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fmul_v2_v_imm: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40590000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fmul_v2_v_imm: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 0x4059000000000000 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %mul = fmul <2 x double> %load, <double 100.0, double 100.0> |
| store <2 x double> %mul, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Multiplies a loaded <2 x double> by a splat of a per-lane value: the |
| ; workitem id zero-extended to i64 and bitcast to double, inserted into |
| ; both vector elements. The expected output builds the 64-bit value from |
| ; the masked id plus a zero high dword, copies it to the second element, |
| ; and issues a single v_pk_mul_f64. |
| define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fmul_v2_v_v_splat: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, v0 |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[2:5], v[4:7], v[0:3] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fmul_v2_v_v_splat: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[0:1] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[2:5], v[4:7], v[0:3] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %id.1 = zext i32 %id to i64 |
| %fid = bitcast i64 %id.1 to double |
| %tmp1 = insertelement <2 x double> poison, double %fid, i64 0 |
| %k = insertelement <2 x double> %tmp1, double %fid, i64 1 |
| %mul = fmul <2 x double> %load, %k |
| store <2 x double> %mul, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Multiplies a loaded <2 x double> by the constant splat <4.0, 4.0>. |
| ; In the expected output the constant is still copied into v[4:7] before |
| ; the v_pk_mul_f64 rather than appearing as an operand of the multiply; |
| ; the GlobalISel output writes it as the literal 4.0 in an s_mov_b64, the |
| ; SelectionDAG output as the high dword 0x40100000. |
| define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fmul_v2_v_lit_splat: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40100000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fmul_v2_v_lit_splat: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 4.0 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %mul = fmul <2 x double> %load, <double 4.0, double 4.0> |
| store <2 x double> %mul, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Multiplies a loaded <2 x double> by the non-splat constant <4.0, 3.0>. |
| ; The two elements differ, so the expected output materializes each one |
| ; separately (4.0 with high dword 0x40100000, 3.0 with high dword |
| ; 0x40080000) into v[4:7] before the v_pk_mul_f64. |
| define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fmul_v2_v_unfoldable_lit: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40100000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v7, 0x40080000 :: v_dual_mov_b32 v6, v4 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fmul_v2_v_unfoldable_lit: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], 0x4008000000000000 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 4.0 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %mul = fmul <2 x double> %load, <double 4.0, double 3.0> |
| store <2 x double> %mul, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Multiplies a loaded <2 x double> by a splat of the negated scalar kernel |
| ; argument %x. The negation is written as `fsub double -0.0, %x`. |
| ; In the expected output the negation is not applied as a modifier on the |
| ; v_pk_mul_f64 itself: the SelectionDAG output flips the sign bit of the |
| ; high dword with s_xor_b32, and the GlobalISel output emits |
| ; v_max_num_f64 with both sources negated, before splatting into v[4:7]. |
| define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, double %x) { |
| ; GFX1251-SDAG-LABEL: fmul_v2_v_fneg: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fmul_v2_v_fneg: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: v_max_num_f64_e64 v[4:5], -s[2:3], -s[2:3] |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], v[4:5] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_mul_f64 v[0:3], v[0:3], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %fneg = fsub double -0.0, %x |
| %tmp1 = insertelement <2 x double> poison, double %fneg, i64 0 |
| %k = insertelement <2 x double> %tmp1, double %fneg, i64 1 |
| %mul = fmul <2 x double> %load, %k |
| store <2 x double> %mul, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Computes llvm.fma on a loaded <2 x double> used as all three operands |
| ; and stores the result back to the same address. Both selectors share |
| ; one set of check lines: a single v_pk_fma_f64 whose three sources are |
| ; the loaded registers v[0:3]. |
| define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) { |
| ; GFX1251-LABEL: fma_v2_vv: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: v_and_b32_e32 v4, 0x3ff, v0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset |
| ; GFX1251-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[0:3], v[0:3] |
| ; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset |
| ; GFX1251-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %fma = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %load, <2 x double> %load, <2 x double> %load) |
| store <2 x double> %fma, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Computes llvm.fma(%load, %x, %x), where %load is a <2 x double> read at |
| ; index workitem.id.x and %x is a <2 x double> kernel argument. |
| ; In the expected output the argument is loaded into s[0:3], copied into |
| ; v[4:7], and used as both the second and third source of v_pk_fma_f64. |
| define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x double> %x) { |
| ; GFX1251-LABEL: fma_v2_vs: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_clause 0x1 |
| ; GFX1251-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv |
| ; GFX1251-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset |
| ; GFX1251-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[4:7], v[4:7] |
| ; GFX1251-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset |
| ; GFX1251-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %fma = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %load, <2 x double> %x, <2 x double> %x) |
| store <2 x double> %fma, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Computes llvm.fma(%x, %y, %z) where all three operands are <2 x double> |
| ; kernel arguments, and stores the result directly to %a (no workitem-id |
| ; indexing). In the expected output every operand is loaded into SGPRs |
| ; and then copied into VGPRs (v[0:3], v[4:7], v[8:11]) before the single |
| ; v_pk_fma_f64; the store uses a zeroed VGPR as its offset. |
| define amdgpu_kernel void @fma_v2_ss(ptr addrspace(1) %a, <2 x double> %x, <2 x double> %y, <2 x double> %z) { |
| ; GFX1251-SDAG-LABEL: fma_v2_ss: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_clause 0x2 |
| ; GFX1251-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv |
| ; GFX1251-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 nv |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, s3 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[4:7], v[8:11] |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fma_v2_ss: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_clause 0x2 |
| ; GFX1251-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv |
| ; GFX1251-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 nv |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[4:7], v[8:11] |
| ; GFX1251-GISEL-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %fma = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) |
| store <2 x double> %fma, ptr addrspace(1) %a, align 8 |
| ret void |
| } |
| |
| ; Computes llvm.fma(%load, %x, %x), where %load is a <4 x double> read at |
| ; index workitem.id.x and %x is a <4 x double> kernel argument. |
| ; In the expected output the four-element operation is split into two |
| ; v_pk_fma_f64 instructions, one per 128-bit half, each using its half of |
| ; the argument (copied from SGPRs into v[8:11] / v[12:15]) as both the |
| ; second and third source. |
| define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x double> %x) { |
| ; GFX1251-LABEL: fma_v4_vs: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 nv |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_lshlrev_b32_e32 v16, 5, v0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_clause 0x1 |
| ; GFX1251-NEXT: global_load_b128 v[0:3], v16, s[0:1] |
| ; GFX1251-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:16 |
| ; GFX1251-NEXT: v_mov_b64_e32 v[8:9], s[8:9] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[10:11], s[10:11] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[12:13], s[12:13] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[14:15], s[14:15] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX1251-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[8:11], v[8:11] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-NEXT: v_pk_fma_f64 v[4:7], v[4:7], v[12:15], v[12:15] |
| ; GFX1251-NEXT: s_clause 0x1 |
| ; GFX1251-NEXT: global_store_b128 v16, v[0:3], s[0:1] |
| ; GFX1251-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 |
| ; GFX1251-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <4 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <4 x double>, ptr addrspace(1) %gep, align 16 |
| %fma = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %load, <4 x double> %x, <4 x double> %x) |
| store <4 x double> %fma, ptr addrspace(1) %gep, align 16 |
| ret void |
| } |
| |
| define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x double> %x) { |
| ; GFX1251-SDAG-LABEL: fma_v32_vs: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_clause 0x2 |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0x1a4 nv |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[52:67], s[4:5], 0x1e4 nv |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0x164 nv |
| ; GFX1251-SDAG-NEXT: v_lshlrev_b32_e32 v44, 8, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_clause 0xf |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[40:43], v44, s[0:1] offset:144 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[36:39], v44, s[0:1] offset:128 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[32:35], v44, s[0:1] offset:176 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[28:31], v44, s[0:1] offset:160 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[24:27], v44, s[0:1] offset:208 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[20:23], v44, s[0:1] offset:192 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[16:19], v44, s[0:1] offset:240 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[12:15], v44, s[0:1] offset:224 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[8:11], v44, s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v44, s[0:1] |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v44, s[0:1] offset:80 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[46:49], v44, s[0:1] offset:64 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[50:53], v44, s[0:1] offset:112 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[54:57], v44, s[0:1] offset:96 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[58:61], v44, s[0:1] offset:32 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[62:65], v44, s[0:1] offset:48 |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[68:69], s[22:23] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[66:67], s[20:21] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[72:73], s[18:19] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[70:71], s[16:17] |
| ; GFX1251-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0x124 nv |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xf |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[40:43], v[40:43], v[66:69], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[68:69], s[30:31] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[66:67], s[28:29] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xe |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[36:39], v[36:39], v[70:73], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[72:73], s[26:27] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[70:71], s[24:25] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xd |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[32:35], v[32:35], v[66:69], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[68:69], s[58:59] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[66:67], s[56:57] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xc |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[28:31], v[28:31], v[70:73], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[72:73], s[54:55] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[70:71], s[52:53] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xb |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[24:27], v[24:27], v[66:69], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[68:69], s[66:67] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[66:67], s[64:65] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[20:23], v[20:23], v[70:73], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[72:73], s[62:63] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[70:71], s[60:61] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x9 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[16:19], v[16:19], v[66:69], v[66:69] |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[68:69], s[14:15] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[66:67], s[12:13] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[12:15], v[12:15], v[70:73], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[72:73], s[10:11] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[70:71], s[8:9] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x7 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[8:11], v[8:11], v[66:69], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[68:69], s[42:43] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[66:67], s[40:41] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x6 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[70:73], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[72:73], s[38:39] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[70:71], s[36:37] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x5 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[4:7], v[4:7], v[66:69], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[68:69], s[50:51] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[66:67], s[48:49] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[46:49], v[46:49], v[70:73], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[72:73], s[46:47] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[70:71], s[44:45] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x3 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[50:53], v[50:53], v[66:69], v[66:69] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[68:69], s[18:19] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[66:67], s[16:17] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[54:57], v[54:57], v[70:73], v[70:73] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[72:73], s[22:23] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[70:71], s[20:21] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[58:61], v[58:61], v[66:69], v[66:69] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[62:65], v[62:65], v[70:73], v[70:73] |
| ; GFX1251-SDAG-NEXT: s_clause 0xf |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[54:57], s[0:1] offset:96 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[50:53], s[0:1] offset:112 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[46:49], s[0:1] offset:64 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[4:7], s[0:1] offset:80 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[58:61], s[0:1] offset:32 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[62:65], s[0:1] offset:48 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[0:3], s[0:1] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[8:11], s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[12:15], s[0:1] offset:224 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[16:19], s[0:1] offset:240 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[20:23], s[0:1] offset:192 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[24:27], s[0:1] offset:208 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[28:31], s[0:1] offset:160 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[32:35], s[0:1] offset:176 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[36:39], s[0:1] offset:128 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v44, v[40:43], s[0:1] offset:144 |
| ; GFX1251-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fma_v32_vs: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0x124 nv |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[44:59], s[4:5], 0x164 nv |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_lshlrev_b32_e32 v72, 8, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_clause 0xf |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v72, s[0:1] |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v72, s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[8:11], v72, s[0:1] offset:32 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[12:15], v72, s[0:1] offset:48 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[16:19], v72, s[0:1] offset:64 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[20:23], v72, s[0:1] offset:80 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[24:27], v72, s[0:1] offset:96 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[28:31], v72, s[0:1] offset:112 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[32:35], v72, s[0:1] offset:128 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[36:39], v72, s[0:1] offset:144 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[40:43], v72, s[0:1] offset:160 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[44:47], v72, s[0:1] offset:176 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[48:51], v72, s[0:1] offset:192 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[52:55], v72, s[0:1] offset:208 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[56:59], v72, s[0:1] offset:224 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[60:63], v72, s[0:1] offset:240 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[18:19] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[22:23] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[16:17] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[20:21] |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0x1a4 nv |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xf |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[64:67], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[26:27] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xe |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[4:7], v[4:7], v[68:71], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[30:31] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[24:25] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[28:29] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xd |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[8:11], v[8:11], v[64:67], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[46:47] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xc |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[12:15], v[12:15], v[68:71], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[50:51] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[44:45] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[48:49] |
| ; GFX1251-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0x1e4 nv |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xb |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[16:19], v[16:19], v[64:67], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[54:55] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[20:23], v[20:23], v[68:71], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[58:59] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[52:53] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[56:57] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x9 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[24:27], v[24:27], v[64:67], v[64:67] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[10:11] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[28:31], v[28:31], v[68:71], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[14:15] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[8:9] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[12:13] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x7 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[32:35], v[32:35], v[64:67], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[18:19] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x6 |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[36:39], v[36:39], v[68:71], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[22:23] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[16:17] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[20:21] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x5 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[40:43], v[40:43], v[64:67], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[38:39] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[44:47], v[44:47], v[68:71], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[42:43] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[36:37] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[40:41] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x3 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[48:51], v[48:51], v[64:67], v[64:67] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[66:67], s[46:47] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[52:55], v[52:55], v[68:71], v[68:71] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[70:71], s[50:51] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[64:65], s[44:45] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[68:69], s[48:49] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[56:59], v[56:59], v[64:67], v[64:67] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[60:63], v[60:63], v[68:71], v[68:71] |
| ; GFX1251-GISEL-NEXT: s_clause 0xf |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[0:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[4:7], s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[8:11], s[0:1] offset:32 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[12:15], s[0:1] offset:48 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[16:19], s[0:1] offset:64 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[20:23], s[0:1] offset:80 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[24:27], s[0:1] offset:96 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[28:31], s[0:1] offset:112 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[32:35], s[0:1] offset:128 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[36:39], s[0:1] offset:144 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[40:43], s[0:1] offset:160 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[44:47], s[0:1] offset:176 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[48:51], s[0:1] offset:192 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[52:55], s[0:1] offset:208 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[56:59], s[0:1] offset:224 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v72, v[60:63], s[0:1] offset:240 |
| ; GFX1251-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <32 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <32 x double>, ptr addrspace(1) %gep, align 128 |
| %fma = tail call <32 x double> @llvm.fma.v32f32(<32 x double> %load, <32 x double> %x, <32 x double> %x) |
| store <32 x double> %fma, ptr addrspace(1) %gep, align 128 |
| ret void |
| } |
| |
| ; Packed f64 FMA of a loaded <2 x double> with two constant splats: |
| ; the multiplicand is <100.0, 100.0> and the addend is <200.0, 200.0>. |
| ; In the expected output below every v_pk_fma_f64 operand is a VGPR range, |
| ; i.e. both selectors first copy the constants into registers |
| ; (0x40590000 / 0x40690000 are the high dwords of 100.0 / 200.0). |
| define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fma_v2_v_imm: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v12, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40690000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v9, 0x40590000 :: v_dual_mov_b32 v6, v4 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v8, v4 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v10, v4 :: v_dual_mov_b32 v11, v9 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v12, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[8:11], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v12, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fma_v2_v_imm: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[8:9], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v12, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 0x4059000000000000 |
| ; GFX1251-GISEL-NEXT: s_wait_xcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[4:5], 0x4069000000000000 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[6:7] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[4:5] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v12, s[8:9] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[4:7], v[8:11] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v12, v[0:3], s[8:9] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| ; Each work-item updates its own <2 x double> element of %a in place. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| ; result = %load * 100.0 + 200.0 in both lanes. |
| %fma = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %load, <2 x double> <double 100.0, double 100.0>, <2 x double> <double 200.0, double 200.0>) |
| store <2 x double> %fma, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Packed f64 FMA where the multiplicand and the addend are the same splat of |
| ; a per-work-item value: the work-item id zero-extended to i64 and bitcast to |
| ; double. The expected output builds the splat in v[0:3] as {id, 0, id, 0} |
| ; and passes that one register range as both src1 and src2 of v_pk_fma_f64. |
| define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fma_v2_v_v_splat: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, v0 |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[2:5], v[4:7], v[0:3], v[0:3] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fma_v2_v_v_splat: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[0:1] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v0, s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[2:5], v[4:7], v[0:3], v[0:3] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v0, v[2:5], s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| ; Reinterpret the zero-extended id bits as a double; this is a bitcast, not a |
| ; numeric int-to-fp conversion. |
| %id.1 = zext i32 %id to i64 |
| %fid = bitcast i64 %id.1 to double |
| ; Splat %fid into both lanes. |
| %tmp1 = insertelement <2 x double> poison, double %fid, i64 0 |
| %k = insertelement <2 x double> %tmp1, double %fid, i64 1 |
| %fma = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %load, <2 x double> %k, <2 x double> %k) |
| store <2 x double> %fma, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Packed f64 FMA of a loaded <2 x double> with constant splats <4.0, 4.0> |
| ; (multiplicand) and <1.0, 1.0> (addend). As the expected output shows, the |
| ; constants still reach v_pk_fma_f64 through VGPR ranges: the SelectionDAG run |
| ; moves the high dwords 0x40100000 / 0x3ff00000 into VGPRs, while the |
| ; GlobalISel run uses s_mov_b64 with 4.0 / 1.0 followed by v_mov_b64 copies. |
| define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fma_v2_v_lit_splat: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v12, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x3ff00000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v9, 0x40100000 :: v_dual_mov_b32 v6, v4 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v8, v4 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v10, v4 :: v_dual_mov_b32 v11, v9 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v12, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[8:11], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v12, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fma_v2_v_lit_splat: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[8:9], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v12, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 4.0 |
| ; GFX1251-GISEL-NEXT: s_wait_xcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[4:5], 1.0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[6:7] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[4:5] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v12, s[8:9] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[4:7], v[8:11] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v12, v[0:3], s[8:9] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| ; Each work-item updates its own <2 x double> element of %a in place. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| ; result = %load * 4.0 + 1.0 in both lanes. |
| %fma = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %load, <2 x double> <double 4.0, double 4.0>, <2 x double> <double 1.0, double 1.0>) |
| store <2 x double> %fma, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Packed f64 FMA with constant operands whose two lanes differ: |
| ; multiplicand <4.0, 3.0> and addend <1.0, 2.0>. Unlike the splat cases, the |
| ; expected output sets up each lane's constant separately (four distinct |
| ; values) before the register-only v_pk_fma_f64. |
| define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fma_v2_v_unfoldable_lit: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v12, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x3ff00000 |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v7, 2.0 |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v9, 0x40100000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v11, 0x40080000 :: v_dual_mov_b32 v10, v4 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v8, v4 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v12, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[8:11], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v12, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fma_v2_v_unfoldable_lit: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[8:9], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v12, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[2:3], 0x4008000000000000 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[6:7], 2.0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[0:1], 4.0 |
| ; GFX1251-GISEL-NEXT: s_wait_xcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_mov_b64 s[4:5], 1.0 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[6:7] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[4:5] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v12, s[8:9] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[4:7], v[8:11] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v12, v[0:3], s[8:9] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| ; Each work-item updates its own <2 x double> element of %a in place. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| ; lane 0: %load * 4.0 + 1.0, lane 1: %load * 3.0 + 2.0. |
| %fma = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %load, <2 x double> <double 4.0, double 3.0>, <2 x double> <double 1.0, double 2.0>) |
| store <2 x double> %fma, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Packed f64 FMA whose multiplicand and addend are both a splat of the negated |
| ; scalar kernel argument %x. In the expected output the negation is performed |
| ; before the FMA and v_pk_fma_f64 carries no neg modifiers: the SelectionDAG |
| ; run flips the sign bit with s_xor_b32 on the high dword, and the GlobalISel |
| ; run emits v_max_num_f64 with negated source operands and copies the result. |
| ; NOTE(review): negation is written as `fsub -0.0, %x` rather than the `fneg` |
| ; instruction; confirm this legacy spelling is what the test intends to cover. |
| define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, double %x) { |
| ; GFX1251-SDAG-LABEL: fma_v2_v_fneg: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_xor_b32 s5, s3, 0x80000000 |
| ; GFX1251-SDAG-NEXT: s_mov_b32 s4, s2 |
| ; GFX1251-SDAG-NEXT: s_mov_b32 s6, s2 |
| ; GFX1251-SDAG-NEXT: s_mov_b32 s7, s5 |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] |
| ; GFX1251-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[4:7], v[4:7] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fma_v2_v_fneg: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: v_max_num_f64_e64 v[4:5], -s[2:3], -s[2:3] |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], v[4:5] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[4:7], v[4:7] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| ; Negate the scalar argument, then splat it into both lanes. |
| %fneg = fsub double -0.0, %x |
| %tmp1 = insertelement <2 x double> poison, double %fneg, i64 0 |
| %k = insertelement <2 x double> %tmp1, double %fneg, i64 1 |
| %fma = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %load, <2 x double> %k, <2 x double> %k) |
| store <2 x double> %fma, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Packed f64 FMA of two <2 x double> vectors loaded from addrspace(3), with a |
| ; negated addend assembled from two separately loaded scalar doubles (lane 0 |
| ; from %arg2[0], lane 1 from %arg2[2]). All loads are volatile with align 4. |
| ; In the expected output the SelectionDAG run expresses the negation as |
| ; neg_lo:[0,0,1] neg_hi:[0,0,1] on v_pk_fma_f64, whereas the GlobalISel run |
| ; emits v_pk_max_num_f64 plus two v_xor_b32 sign flips and an unmodified |
| ; v_pk_fma_f64. |
| define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) { |
| ; GFX1251-SDAG-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: |
| ; GFX1251-SDAG: ; %bb.0: ; %bb |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v10, s3 |
| ; GFX1251-SDAG-NEXT: ds_load_2addr_b32 v[2:3], v4 offset0:2 offset1:3 |
| ; GFX1251-SDAG-NEXT: ds_load_2addr_b32 v[0:1], v4 offset1:1 |
| ; GFX1251-SDAG-NEXT: ds_load_2addr_b32 v[6:7], v4 offset0:6 offset1:7 |
| ; GFX1251-SDAG-NEXT: ds_load_2addr_b32 v[4:5], v4 offset0:4 offset1:5 |
| ; GFX1251-SDAG-NEXT: ds_load_2addr_b32 v[8:9], v10 offset1:1 |
| ; GFX1251-SDAG-NEXT: ds_load_2addr_b32 v[10:11], v10 offset0:4 offset1:5 |
| ; GFX1251-SDAG-NEXT: s_wait_dscnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: |
| ; GFX1251-GISEL: ; %bb.0: ; %bb |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v10, s3 |
| ; GFX1251-GISEL-NEXT: ds_load_2addr_b32 v[0:1], v6 offset1:1 |
| ; GFX1251-GISEL-NEXT: ds_load_2addr_b32 v[2:3], v6 offset0:2 offset1:3 |
| ; GFX1251-GISEL-NEXT: ds_load_2addr_b32 v[4:5], v6 offset0:4 offset1:5 |
| ; GFX1251-GISEL-NEXT: ds_load_2addr_b32 v[6:7], v6 offset0:6 offset1:7 |
| ; GFX1251-GISEL-NEXT: ds_load_2addr_b32 v[8:9], v10 offset1:1 |
| ; GFX1251-GISEL-NEXT: ds_load_2addr_b32 v[10:11], v10 offset0:4 offset1:5 |
| ; GFX1251-GISEL-NEXT: s_wait_dscnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[8:11], v[8:11], v[8:11] |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[0:3], v[0:3], v[4:7], v[8:11] |
| ; GFX1251-GISEL-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| bb: |
| ; Addresses of the second <2 x double> in %lds and the third double in %arg2. |
| %lds.gep1 = getelementptr inbounds <2 x double>, ptr addrspace(3) %lds, i32 1 |
| %arg2.gep = getelementptr inbounds double, ptr addrspace(3) %arg2, i32 2 |
| |
| ; The two vector FMA operands. |
| %vec0 = load volatile <2 x double>, ptr addrspace(3) %lds, align 4 |
| %vec1 = load volatile <2 x double>, ptr addrspace(3) %lds.gep1, align 4 |
| |
| ; The two scalars that become the lanes of the addend. |
| %scalar0 = load volatile double, ptr addrspace(3) %arg2, align 4 |
| %scalar1 = load volatile double, ptr addrspace(3) %arg2.gep, align 4 |
| |
| ; Build <scalar0, scalar1> and negate it via `fsub -0.0, x`. |
| %vec.ins0 = insertelement <2 x double> poison, double %scalar0, i32 0 |
| %vec2 = insertelement <2 x double> %vec.ins0, double %scalar1, i32 1 |
| %neg.vec2 = fsub <2 x double> <double -0.0, double -0.0>, %vec2 |
| |
| %result = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %vec0, <2 x double> %vec1, <2 x double> %neg.vec2) |
| store <2 x double> %result, ptr addrspace(1) %out, align 4 |
| ret void |
| } |
| |
| ; Negation of a <2 x double> loaded from memory, written as a vector |
| ; `fsub -0.0, %load`. In the expected output the SelectionDAG run produces a |
| ; single v_pk_add_f64 against 0 with neg_lo:[1,1] neg_hi:[1,1], while the |
| ; GlobalISel run flips the sign bit of each lane's high dword (v1, v3) with |
| ; v_xor_b32 and then emits v_pk_max_num_f64 of the value with itself. |
| ; NOTE(review): this uses `fsub -0.0, x` rather than the `fneg` instruction; |
| ; confirm this legacy spelling is what the test intends to cover. |
| define amdgpu_kernel void @fneg_v2f64_vec(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fneg_v2f64_vec: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fneg_v2f64_vec: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[0:3], v[0:3], v[0:3] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| ; Each work-item negates its own <2 x double> element of %a in place. |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <2 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <2 x double>, ptr addrspace(1) %gep, align 8 |
| %fneg = fsub <2 x double> <double -0.0, double -0.0>, %load |
| store <2 x double> %fneg, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Negation of a <2 x double> passed as a kernel argument, written as a vector |
| ; `fsub -0.0, %x`, with the result stored to %a. In the expected output both |
| ; runs flip the sign bits with s_xor_b32 on the high dwords (s1, s3); the |
| ; GlobalISel run additionally emits v_pk_max_num_f64 of the value with itself |
| ; before the store. |
| ; NOTE(review): this uses `fsub -0.0, x` rather than the `fneg` instruction; |
| ; confirm this legacy spelling is what the test intends to cover. |
| define amdgpu_kernel void @fneg_v2f64_scalar(ptr addrspace(1) %a, <2 x double> %x) { |
| ; GFX1251-SDAG-LABEL: fneg_v2f64_scalar: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_clause 0x1 |
| ; GFX1251-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s0 |
| ; GFX1251-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000 |
| ; GFX1251-SDAG-NEXT: s_xor_b32 s1, s1, 0x80000000 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fneg_v2f64_scalar: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_clause 0x1 |
| ; GFX1251-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_xor_b32 s1, s1, 0x80000000 |
| ; GFX1251-GISEL-NEXT: s_xor_b32 s3, s3, 0x80000000 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[0:3], v[0:3], v[0:3] |
| ; GFX1251-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| ; No work-item id is used here: the result is stored at %a itself. |
| %fneg = fsub <2 x double> <double -0.0, double -0.0>, %x |
| store <2 x double> %fneg, ptr addrspace(1) %a, align 8 |
| ret void |
| } |
| |
| define amdgpu_kernel void @fneg_v32f64_vec(ptr addrspace(1) %a) { |
| ; GFX1251-SDAG-LABEL: fneg_v32f64_vec: |
| ; GFX1251-SDAG: ; %bb.0: |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_lshlrev_b32_e32 v56, 8, v0 |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: s_clause 0xd |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:96 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:112 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:224 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:240 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:128 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:144 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:160 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:176 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[32:35], v56, s[0:1] offset:192 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[36:39], v56, s[0:1] offset:208 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[40:43], v56, s[0:1] offset:64 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[44:47], v56, s[0:1] offset:80 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[48:51], v56, s[0:1] offset:32 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[52:55], v56, s[0:1] offset:48 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xd |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xc |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[4:7], v[4:7], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xb |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[8:11], v[8:11], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[12:15], v[12:15], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x9 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[16:19], v[16:19], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[20:23], v[20:23], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_clause 0x3 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:96 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:112 |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] |
| ; GFX1251-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x5 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[40:43], v[40:43], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[44:47], v[44:47], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x3 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[48:51], v[48:51], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[52:55], v[52:55], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[28:31], v[28:31], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[24:27], v[24:27], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[36:39], v[36:39], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[32:35], v[32:35], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[0:3], v[0:3], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-SDAG-NEXT: v_pk_add_f64 v[4:7], v[4:7], 0 neg_lo:[1,1] neg_hi:[1,1] |
| ; GFX1251-SDAG-NEXT: s_clause 0xd |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[40:43], s[0:1] offset:64 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[44:47], s[0:1] offset:80 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:32 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:48 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:224 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:240 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[32:35], s[0:1] offset:192 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[36:39], s[0:1] offset:208 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:160 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:176 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:128 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:144 |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fneg_v32f64_vec: |
| ; GFX1251-GISEL: ; %bb.0: |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_lshlrev_b32_e32 v56, 8, v0 |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: s_clause 0xd |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[40:43], v56, s[0:1] |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[36:39], v56, s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:32 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:64 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:80 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:96 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:112 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:128 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:144 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[32:35], v56, s[0:1] offset:160 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[44:47], v56, s[0:1] offset:176 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[48:51], v56, s[0:1] offset:192 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[52:55], v56, s[0:1] offset:208 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xd |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v41, 0x80000000, v41 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v43, 0x80000000, v43 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xc |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v37, 0x80000000, v37 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v39, 0x80000000, v39 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xb |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[40:43], v[40:43], v[40:43] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[36:39], v[36:39], v[36:39] |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x9 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 |
| ; GFX1251-GISEL-NEXT: s_clause 0x3 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[40:43], s[0:1] |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[40:43], v56, s[0:1] offset:224 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[36:39], s[0:1] offset:16 |
| ; GFX1251-GISEL-NEXT: global_load_b128 v[36:39], v56, s[0:1] offset:240 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x9 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v19, 0x80000000, v19 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v21, 0x80000000, v21 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v23, 0x80000000, v23 |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[0:3], v[0:3], v[0:3] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x7 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v25, 0x80000000, v25 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v27, 0x80000000, v27 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x6 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v29, 0x80000000, v29 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v31, 0x80000000, v31 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x5 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v33, 0x80000000, v33 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v35, 0x80000000, v35 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v45, 0x80000000, v45 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v47, 0x80000000, v47 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x3 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v49, 0x80000000, v49 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v51, 0x80000000, v51 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v53, 0x80000000, v53 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v55, 0x80000000, v55 |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[4:7], v[4:7], v[4:7] |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[8:11], v[8:11], v[8:11] |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[12:15], v[12:15], v[12:15] |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[16:19], v[16:19], v[16:19] |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[20:23], v[20:23], v[20:23] |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[24:27], v[24:27], v[24:27] |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[28:31], v[28:31], v[28:31] |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[32:35], v[32:35], v[32:35] |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[44:47], v[44:47], v[44:47] |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[48:51], v[48:51], v[48:51] |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[52:55], v[52:55], v[52:55] |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v41, 0x80000000, v41 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v43, 0x80000000, v43 |
| ; GFX1251-GISEL-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v37, 0x80000000, v37 |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v39, 0x80000000, v39 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[40:43], v[40:43], v[40:43] |
| ; GFX1251-GISEL-NEXT: v_pk_max_num_f64 v[36:39], v[36:39], v[36:39] |
| ; GFX1251-GISEL-NEXT: s_clause 0xd |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:32 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:64 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:80 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:96 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:112 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:128 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:144 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[32:35], s[0:1] offset:160 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[44:47], s[0:1] offset:176 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:192 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:208 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[40:43], s[0:1] offset:224 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v56, v[36:39], s[0:1] offset:240 |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %gep = getelementptr inbounds <32 x double>, ptr addrspace(1) %a, i32 %id |
| %load = load <32 x double>, ptr addrspace(1) %gep, align 8 |
| %fneg = fsub <32 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %load |
| store <32 x double> %fneg, ptr addrspace(1) %gep, align 8 |
| ret void |
| } |
| |
| ; Checks how a <2 x double> fneg that feeds the addend of llvm.fma.v2f64 is selected. |
| ; In the SelectionDAG checks the negation shows up as neg_lo/neg_hi modifiers on the |
| ; third v_pk_fma_f64 operand; in the GlobalISel checks it shows up as a separate |
| ; v_xor_b32 with 0x80000000 ahead of a v_pk_fma_f64 that carries no modifiers. |
| define amdgpu_kernel void @fneg_v2f64_pkfma(ptr addrspace(1) %out) { |
| ; GFX1251-SDAG-LABEL: fneg_v2f64_pkfma: |
| ; GFX1251-SDAG: ; %bb.0: ; %entry |
| ; GFX1251-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-SDAG-NEXT: s_mov_b32 s2, 0 |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX1251-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2 |
| ; GFX1251-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX1251-SDAG-NEXT: v_cndmask_b32_e64 v1, 0x3ff00000, 0, vcc_lo |
| ; GFX1251-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX1251-SDAG-NEXT: v_mov_b32_e32 v3, v1 |
| ; GFX1251-SDAG-NEXT: v_pk_fma_f64 v[0:3], v[0:3], 0, v[0:3] neg_lo:[0,0,1] neg_hi:[0,0,1] |
| ; GFX1251-SDAG-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX1251-SDAG-NEXT: s_endpgm |
| ; |
| ; GFX1251-GISEL-LABEL: fneg_v2f64_pkfma: |
| ; GFX1251-GISEL: ; %bb.0: ; %entry |
| ; GFX1251-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 |
| ; GFX1251-GISEL-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX1251-GISEL-NEXT: v_cndmask_b32_e64 v1, 0x3ff00000, 0, vcc_lo |
| ; GFX1251-GISEL-NEXT: v_mov_b32_e32 v4, v0 |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX1251-GISEL-NEXT: v_xor_b32_e32 v5, 0x80000000, v1 |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[0:1] |
| ; GFX1251-GISEL-NEXT: v_mov_b64_e32 v[6:7], v[4:5] |
| ; GFX1251-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-GISEL-NEXT: v_pk_fma_f64 v[2:5], v[0:3], 0, v[4:7] |
| ; GFX1251-GISEL-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-GISEL-NEXT: global_store_b128 v0, v[2:5], s[0:1] |
| ; GFX1251-GISEL-NEXT: s_endpgm |
| entry: |
| %tid = tail call i32 @llvm.amdgcn.workitem.id.x() |
| %cmp = icmp eq i32 %tid, 0 |
| ; %v is all zeros when the work-item x id is 0, otherwise a splat of 1.0. |
| %v = select i1 %cmp, <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) |
| %nv = fneg <2 x double> %v |
| ; fma(%v, 0, -%v): the negated copy of %v is the addend, the multiplier is a zero vector. |
| %r = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v, <2 x double> zeroinitializer, <2 x double> %nv) |
| store <2 x double> %r, ptr addrspace(1) %out, align 16 |
| ret void |
| } |