| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX1250 %s |
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1251 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX1251 %s |
| |
; s_shl_v2i64: element-wise shl of two <2 x i64> kernel arguments, stored to %out.
; Both operands are kernel arguments, and the expected code performs the two
; 64-bit shifts with s_lshl_b64 on both targets. The targets differ in how the
; result is copied into VGPRs for the 128-bit store: v_dual_mov_b32/v_mov_b32
; on gfx1250 versus v_mov_b64 on gfx1251.
| define amdgpu_kernel void @s_shl_v2i64(ptr addrspace(1) %out, <2 x i64> %lhs, <2 x i64> %rhs) #0 { |
| ; GFX1250-LABEL: s_shl_v2i64: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1250-NEXT: s_clause 0x1 |
| ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv |
| ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: s_lshl_b64 s[2:3], s[8:9], s12 |
| ; GFX1250-NEXT: s_lshl_b64 s[4:5], s[10:11], s14 |
| ; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s2 |
| ; GFX1250-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s4 |
| ; GFX1250-NEXT: v_mov_b32_e32 v3, s5 |
| ; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX1250-NEXT: s_endpgm |
| ; |
| ; GFX1251-LABEL: s_shl_v2i64: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_clause 0x1 |
| ; GFX1251-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 nv |
| ; GFX1251-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_lshl_b64 s[2:3], s[8:9], s12 |
| ; GFX1251-NEXT: s_lshl_b64 s[4:5], s[10:11], s14 |
| ; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[2:3] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[4:5] |
| ; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[0:1] |
| ; GFX1251-NEXT: s_endpgm |
| %result = shl <2 x i64> %lhs, %rhs |
| store <2 x i64> %result, ptr addrspace(1) %out |
| ret void |
| } |
| |
; s_shl_v2i64_s_imm: kernel-argument <2 x i64> value shifted left by the
; constant vector <1, 2>, stored to %out.
; The expected code uses s_lshl_b64 with the shift amounts 1 and 2 encoded
; directly as operands on both targets; only the SGPR-to-VGPR copy sequence
; before the store differs between gfx1250 and gfx1251.
| define amdgpu_kernel void @s_shl_v2i64_s_imm(ptr addrspace(1) %out, <2 x i64> %lhs) #0 { |
| ; GFX1250-LABEL: s_shl_v2i64_s_imm: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1250-NEXT: s_clause 0x1 |
| ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv |
| ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 |
| ; GFX1250-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 |
| ; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s0 |
| ; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 |
| ; GFX1250-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[6:7] |
| ; GFX1250-NEXT: s_endpgm |
| ; |
| ; GFX1251-LABEL: s_shl_v2i64_s_imm: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_clause 0x1 |
| ; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv |
| ; GFX1251-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 |
| ; GFX1251-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 |
| ; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[0:1] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[2:3] |
| ; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[6:7] |
| ; GFX1251-NEXT: s_endpgm |
| %result = shl <2 x i64> %lhs, <i64 1, i64 2> |
| store <2 x i64> %result, ptr addrspace(1) %out |
| ret void |
| } |
| |
; s_shl_v2i64_imm_s: constant vector <1234, 5678> shifted left by the
; kernel-argument amounts in %rhs, stored to %out.
; 1234 and 5678 appear in the checks as 0x4d2 and 0x162e. For gfx1250 the
; checks expect them as literal source operands of s_lshl_b64. For gfx1251 the
; checks expect them to be materialized into SGPR pairs first (s_mov_b64 for
; the first, s_movk_i32 plus a copy of the first pair's high half for the
; second) and the shifts to read those pairs.
| define amdgpu_kernel void @s_shl_v2i64_imm_s(ptr addrspace(1) %out, <2 x i64> %rhs) #0 { |
| ; GFX1250-LABEL: s_shl_v2i64_imm_s: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1250-NEXT: s_clause 0x1 |
| ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv |
| ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: s_lshl_b64 s[0:1], 0x4d2, s0 |
| ; GFX1250-NEXT: s_lshl_b64 s[2:3], 0x162e, s2 |
| ; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s0 |
| ; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 |
| ; GFX1250-NEXT: v_mov_b32_e32 v3, s3 |
| ; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[6:7] |
| ; GFX1250-NEXT: s_endpgm |
| ; |
| ; GFX1251-LABEL: s_shl_v2i64_imm_s: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_clause 0x1 |
| ; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv |
| ; GFX1251-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: s_mov_b64 s[8:9], 0x4d2 |
| ; GFX1251-NEXT: s_wait_xcnt 0x0 |
| ; GFX1251-NEXT: s_movk_i32 s4, 0x162e |
| ; GFX1251-NEXT: s_mov_b32 s5, s9 |
| ; GFX1251-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_lshl_b64 s[0:1], s[8:9], s0 |
| ; GFX1251-NEXT: s_lshl_b64 s[2:3], s[4:5], s2 |
| ; GFX1251-NEXT: v_mov_b64_e32 v[0:1], s[0:1] |
| ; GFX1251-NEXT: v_mov_b64_e32 v[2:3], s[2:3] |
| ; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[6:7] |
| ; GFX1251-NEXT: s_endpgm |
| %result = shl <2 x i64> <i64 1234, i64 5678>, %rhs |
| store <2 x i64> %result, ptr addrspace(1) %out |
| ret void |
| } |
| |
; v_shl_v2i64: both shl operands are <2 x i64> values loaded from memory.
; %a is the element of %in indexed by the workitem id, %b is the element that
; immediately follows it, and the result is stored to the element of %out
; indexed by the same id. Both targets share one set of checks, which expect
; two v_lshlrev_b64 instructions fed by two 128-bit global loads.
| define amdgpu_kernel void @v_shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { |
| ; GCN-LABEL: v_shl_v2i64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GCN-NEXT: v_and_b32_e32 v8, 0x3ff, v0 |
| ; GCN-NEXT: s_wait_kmcnt 0x0 |
| ; GCN-NEXT: s_clause 0x1 |
| ; GCN-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16 scale_offset |
| ; GCN-NEXT: global_load_b128 v[4:7], v8, s[2:3] scale_offset |
| ; GCN-NEXT: s_wait_loadcnt 0x0 |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[6:7] |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[4:5] |
| ; GCN-NEXT: global_store_b128 v8, v[0:3], s[0:1] scale_offset |
| ; GCN-NEXT: s_endpgm |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid.ext = sext i32 %tid to i64 |
| %in.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext |
| %out.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i64 %tid.ext |
; The shift amounts live one <2 x i64> element past the value being shifted.
| %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in.gep, i32 1 |
| %a = load <2 x i64>, ptr addrspace(1) %in.gep |
| %b = load <2 x i64>, ptr addrspace(1) %b_ptr |
| %result = shl <2 x i64> %a, %b |
| store <2 x i64> %result, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
; shl_v_s_v2i64: loaded <2 x i64> value shifted left by kernel-argument
; amounts (%sgpr). The value comes from the element of %in indexed by the
; workitem id and the result goes to the matching element of %out.
; Both targets share one set of checks, which expect v_lshlrev_b64 with the
; shift amount read directly from an SGPR (s12 / s14).
| define amdgpu_kernel void @shl_v_s_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i64> %sgpr) #0 { |
| ; GCN-LABEL: shl_v_s_v2i64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GCN-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x3ff, v0 |
| ; GCN-NEXT: s_wait_kmcnt 0x0 |
| ; GCN-NEXT: global_load_b128 v[0:3], v4, s[10:11] scale_offset |
| ; GCN-NEXT: s_wait_loadcnt 0x0 |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], s14, v[2:3] |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], s12, v[0:1] |
| ; GCN-NEXT: global_store_b128 v4, v[0:3], s[8:9] scale_offset |
| ; GCN-NEXT: s_endpgm |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid.ext = sext i32 %tid to i64 |
| %in.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext |
| %out.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i64 %tid.ext |
| %vgpr = load <2 x i64>, ptr addrspace(1) %in.gep |
| %result = shl <2 x i64> %vgpr, %sgpr |
| store <2 x i64> %result, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
; shl_s_v_v2i64: kernel-argument <2 x i64> value (%sgpr) shifted left by
; amounts loaded from memory. This is the operand-swapped counterpart of
; shl_v_s_v2i64: here the loaded vector is the shift amount.
; Both targets share one set of checks, which expect the e64 form of
; v_lshlrev_b64 with the shifted value read from an SGPR pair.
| define amdgpu_kernel void @shl_s_v_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i64> %sgpr) #0 { |
| ; GCN-LABEL: shl_s_v_v2i64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GCN-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 nv |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x3ff, v0 |
| ; GCN-NEXT: s_wait_kmcnt 0x0 |
| ; GCN-NEXT: global_load_b128 v[0:3], v4, s[10:11] scale_offset |
| ; GCN-NEXT: s_wait_loadcnt 0x0 |
| ; GCN-NEXT: v_lshlrev_b64_e64 v[2:3], v2, s[14:15] |
| ; GCN-NEXT: v_lshlrev_b64_e64 v[0:1], v0, s[12:13] |
| ; GCN-NEXT: global_store_b128 v4, v[0:3], s[8:9] scale_offset |
| ; GCN-NEXT: s_endpgm |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid.ext = sext i32 %tid to i64 |
| %in.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext |
| %out.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i64 %tid.ext |
| %vgpr = load <2 x i64>, ptr addrspace(1) %in.gep |
| %result = shl <2 x i64> %sgpr, %vgpr |
| store <2 x i64> %result, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
; shl_imm_v_v2i64: constant splat <8, 8> shifted left by amounts loaded from
; the element of %in indexed by the workitem id; the result is stored to the
; matching element of %out.
; For gfx1250 the checks expect the constant 8 directly as an operand of
; v_lshlrev_b64. For gfx1251 the checks expect it to be materialized with
; s_mov_b64 into s[2:3], copied into s[4:5], and the shifts to read those
; SGPR pairs.
| define amdgpu_kernel void @shl_imm_v_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { |
| ; GFX1250-LABEL: shl_imm_v_v2i64: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset |
| ; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1250-NEXT: v_lshlrev_b64_e64 v[2:3], v2, 8 |
| ; GFX1250-NEXT: v_lshlrev_b64_e64 v[0:1], v0, 8 |
| ; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset |
| ; GFX1250-NEXT: s_endpgm |
| ; |
| ; GFX1251-LABEL: shl_imm_v_v2i64: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: v_and_b32_e32 v4, 0x3ff, v0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset |
| ; GFX1251-NEXT: s_wait_xcnt 0x0 |
| ; GFX1251-NEXT: s_mov_b64 s[2:3], 8 |
| ; GFX1251-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX1251-NEXT: s_mov_b32 s4, s2 |
| ; GFX1251-NEXT: s_mov_b32 s5, s3 |
| ; GFX1251-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-NEXT: v_lshlrev_b64_e64 v[2:3], v2, s[4:5] |
| ; GFX1251-NEXT: v_lshlrev_b64_e64 v[0:1], v0, s[2:3] |
| ; GFX1251-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset |
| ; GFX1251-NEXT: s_endpgm |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid.ext = sext i32 %tid to i64 |
| %in.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext |
| %out.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i64 %tid.ext |
| %vgpr = load <2 x i64>, ptr addrspace(1) %in.gep |
| %result = shl <2 x i64> <i64 8, i64 8>, %vgpr |
| store <2 x i64> %result, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
; shl_v_imm_v2i64: loaded <2 x i64> value shifted left by the constant splat
; <8, 8>. The value comes from the element of %in indexed by the workitem id
; and the result goes to the matching element of %out.
; Both targets share one set of checks, which expect v_lshlrev_b64 with the
; shift amount 8 encoded directly as an operand.
| define amdgpu_kernel void @shl_v_imm_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { |
| ; GCN-LABEL: shl_v_imm_v2i64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x3ff, v0 |
| ; GCN-NEXT: s_wait_kmcnt 0x0 |
| ; GCN-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset |
| ; GCN-NEXT: s_wait_loadcnt 0x0 |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], 8, v[2:3] |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], 8, v[0:1] |
| ; GCN-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset |
| ; GCN-NEXT: s_endpgm |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid.ext = sext i32 %tid to i64 |
| %in.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %in, i64 %tid.ext |
| %out.gep = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i64 %tid.ext |
| %vgpr = load <2 x i64>, ptr addrspace(1) %in.gep |
| %result = shl <2 x i64> %vgpr, <i64 8, i64 8> |
| store <2 x i64> %result, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
; v_shl_v4i64: <4 x i64> variant of the memory-operand shift test.
; %a is the element of %in indexed by the workitem id, %b is the element that
; immediately follows it, and the result is stored to the matching element of
; %out. Both targets share one set of checks, which expect four 128-bit loads,
; four v_lshlrev_b64 instructions and two 128-bit stores.
| define amdgpu_kernel void @v_shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { |
| ; GCN-LABEL: v_shl_v4i64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 |
| ; GCN-NEXT: s_wait_kmcnt 0x0 |
| ; GCN-NEXT: s_clause 0x3 |
| ; GCN-NEXT: global_load_b128 v[0:3], v16, s[2:3] offset:48 |
| ; GCN-NEXT: global_load_b128 v[4:7], v16, s[2:3] offset:32 |
| ; GCN-NEXT: global_load_b128 v[8:11], v16, s[2:3] |
| ; GCN-NEXT: global_load_b128 v[12:15], v16, s[2:3] offset:16 |
| ; GCN-NEXT: s_wait_loadcnt 0x1 |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[6:7], v6, v[10:11] |
| ; GCN-NEXT: s_wait_loadcnt 0x0 |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[14:15] |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[12:13] |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[4:5], v4, v[8:9] |
| ; GCN-NEXT: s_clause 0x1 |
| ; GCN-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:16 |
| ; GCN-NEXT: global_store_b128 v16, v[4:7], s[0:1] |
| ; GCN-NEXT: s_endpgm |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid.ext = sext i32 %tid to i64 |
| %in.gep = getelementptr inbounds <4 x i64>, ptr addrspace(1) %in, i64 %tid.ext |
| %out.gep = getelementptr inbounds <4 x i64>, ptr addrspace(1) %out, i64 %tid.ext |
; The shift amounts live one <4 x i64> element past the value being shifted.
| %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in.gep, i32 1 |
| %a = load <4 x i64>, ptr addrspace(1) %in.gep |
| %b = load <4 x i64>, ptr addrspace(1) %b_ptr |
| %result = shl <4 x i64> %a, %b |
| store <4 x i64> %result, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
; shl_v_imm_v4i64: loaded <4 x i64> value shifted left by a constant splat of
; 8. The value comes from the element of %in indexed by the workitem id and
; the result goes to the matching element of %out.
; Both targets share one set of checks, which expect two 128-bit loads, four
; v_lshlrev_b64 instructions with the shift amount 8 as a direct operand, and
; two 128-bit stores.
| define amdgpu_kernel void @shl_v_imm_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { |
| ; GCN-LABEL: shl_v_imm_v4i64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_lshlrev_b32_e32 v8, 5, v0 |
| ; GCN-NEXT: s_wait_kmcnt 0x0 |
| ; GCN-NEXT: s_clause 0x1 |
| ; GCN-NEXT: global_load_b128 v[0:3], v8, s[2:3] |
| ; GCN-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 |
| ; GCN-NEXT: s_wait_loadcnt 0x1 |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], 8, v[2:3] |
| ; GCN-NEXT: s_wait_loadcnt 0x0 |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[6:7], 8, v[6:7] |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[4:5], 8, v[4:5] |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], 8, v[0:1] |
| ; GCN-NEXT: s_clause 0x1 |
| ; GCN-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 |
| ; GCN-NEXT: global_store_b128 v8, v[0:3], s[0:1] |
| ; GCN-NEXT: s_endpgm |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid.ext = sext i32 %tid to i64 |
| %in.gep = getelementptr inbounds <4 x i64>, ptr addrspace(1) %in, i64 %tid.ext |
| %out.gep = getelementptr inbounds <4 x i64>, ptr addrspace(1) %out, i64 %tid.ext |
| %vgpr = load <4 x i64>, ptr addrspace(1) %in.gep |
| %result = shl <4 x i64> %vgpr, <i64 8, i64 8, i64 8, i64 8> |
| store <4 x i64> %result, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
; v_shl_v8i64: <8 x i64> variant of the memory-operand shift test.
; %a is the element of %in indexed by the workitem id, %b is the element that
; immediately follows it, and the result is stored to the matching element of
; %out. Both targets share one set of checks, which expect eight 128-bit loads
; issued in a single clause, eight v_lshlrev_b64 instructions interleaved with
; partial s_wait_loadcnt waits, and four 128-bit stores.
| define amdgpu_kernel void @v_shl_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { |
| ; GCN-LABEL: v_shl_v8i64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_lshlrev_b32_e32 v32, 6, v0 |
| ; GCN-NEXT: s_wait_kmcnt 0x0 |
| ; GCN-NEXT: s_clause 0x7 |
| ; GCN-NEXT: global_load_b128 v[0:3], v32, s[2:3] |
| ; GCN-NEXT: global_load_b128 v[4:7], v32, s[2:3] offset:16 |
| ; GCN-NEXT: global_load_b128 v[8:11], v32, s[2:3] offset:64 |
| ; GCN-NEXT: global_load_b128 v[12:15], v32, s[2:3] offset:80 |
| ; GCN-NEXT: global_load_b128 v[16:19], v32, s[2:3] offset:32 |
| ; GCN-NEXT: global_load_b128 v[20:23], v32, s[2:3] offset:48 |
| ; GCN-NEXT: global_load_b128 v[24:27], v32, s[2:3] offset:112 |
| ; GCN-NEXT: global_load_b128 v[28:31], v32, s[2:3] offset:96 |
| ; GCN-NEXT: s_wait_loadcnt 0x5 |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3] |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1] |
| ; GCN-NEXT: s_wait_loadcnt 0x4 |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[6:7], v14, v[6:7] |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[4:5], v12, v[4:5] |
| ; GCN-NEXT: s_wait_loadcnt 0x1 |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[22:23], v26, v[22:23] |
| ; GCN-NEXT: s_wait_loadcnt 0x0 |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[18:19], v30, v[18:19] |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[16:17], v28, v[16:17] |
| ; GCN-NEXT: v_lshlrev_b64_e32 v[20:21], v24, v[20:21] |
| ; GCN-NEXT: s_clause 0x3 |
| ; GCN-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:32 |
| ; GCN-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:48 |
| ; GCN-NEXT: global_store_b128 v32, v[0:3], s[0:1] |
| ; GCN-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 |
| ; GCN-NEXT: s_endpgm |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid.ext = sext i32 %tid to i64 |
| %in.gep = getelementptr inbounds <8 x i64>, ptr addrspace(1) %in, i64 %tid.ext |
| %out.gep = getelementptr inbounds <8 x i64>, ptr addrspace(1) %out, i64 %tid.ext |
; The shift amounts live one <8 x i64> element past the value being shifted.
| %b_ptr = getelementptr <8 x i64>, ptr addrspace(1) %in.gep, i32 1 |
| %a = load <8 x i64>, ptr addrspace(1) %in.gep |
| %b = load <8 x i64>, ptr addrspace(1) %b_ptr |
| %result = shl <8 x i64> %a, %b |
| store <8 x i64> %result, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
; v_shl_v16i64: <16 x i64> variant of the memory-operand shift test.
; %a is the element of %in indexed by the workitem id, %b is the element that
; immediately follows it, and the result is stored to the matching element of
; %out. Each target's checks expect sixteen 128-bit loads, sixteen
; v_lshlrev_b64 instructions and eight 128-bit stores. The two targets have
; separate check blocks because the load order, clause grouping and register
; assignment differ (for example, the first load clause is s_clause 0xc on
; gfx1250 and s_clause 0xb on gfx1251).
| define amdgpu_kernel void @v_shl_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { |
| ; GFX1250-LABEL: v_shl_v16i64: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_lshlrev_b32_e32 v52, 7, v0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: s_clause 0xc |
| ; GFX1250-NEXT: global_load_b128 v[0:3], v52, s[2:3] offset:144 |
| ; GFX1250-NEXT: global_load_b128 v[4:7], v52, s[2:3] offset:16 |
| ; GFX1250-NEXT: global_load_b128 v[8:11], v52, s[2:3] |
| ; GFX1250-NEXT: global_load_b128 v[12:15], v52, s[2:3] offset:128 |
| ; GFX1250-NEXT: global_load_b128 v[16:19], v52, s[2:3] offset:64 |
| ; GFX1250-NEXT: global_load_b128 v[20:23], v52, s[2:3] offset:192 |
| ; GFX1250-NEXT: global_load_b128 v[24:27], v52, s[2:3] offset:96 |
| ; GFX1250-NEXT: global_load_b128 v[28:31], v52, s[2:3] offset:112 |
| ; GFX1250-NEXT: global_load_b128 v[32:35], v52, s[2:3] offset:80 |
| ; GFX1250-NEXT: global_load_b128 v[36:39], v52, s[2:3] offset:32 |
| ; GFX1250-NEXT: global_load_b128 v[40:43], v52, s[2:3] offset:48 |
| ; GFX1250-NEXT: global_load_b128 v[44:47], v52, s[2:3] offset:224 |
| ; GFX1250-NEXT: global_load_b128 v[48:51], v52, s[2:3] offset:240 |
| ; GFX1250-NEXT: s_wait_loadcnt 0xb |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[6:7] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[4:5] |
| ; GFX1250-NEXT: global_load_b128 v[4:7], v52, s[2:3] offset:208 |
| ; GFX1250-NEXT: s_wait_loadcnt 0xa |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[10:11], v14, v[10:11] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[8:9], v12, v[8:9] |
| ; GFX1250-NEXT: global_load_b128 v[12:15], v52, s[2:3] offset:176 |
| ; GFX1250-NEXT: s_wait_loadcnt 0x9 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[18:19], v22, v[18:19] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[16:17], v20, v[16:17] |
| ; GFX1250-NEXT: global_load_b128 v[20:23], v52, s[2:3] offset:160 |
| ; GFX1250-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[26:27], v46, v[26:27] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[24:25], v44, v[24:25] |
| ; GFX1250-NEXT: s_wait_loadcnt 0x3 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[30:31], v50, v[30:31] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[28:29], v48, v[28:29] |
| ; GFX1250-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v6, v[34:35] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v4, v[32:33] |
| ; GFX1250-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[14:15], v14, v[42:43] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[12:13], v12, v[40:41] |
| ; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[22:23], v22, v[38:39] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[20:21], v20, v[36:37] |
| ; GFX1250-NEXT: s_clause 0x7 |
| ; GFX1250-NEXT: global_store_b128 v52, v[24:27], s[0:1] offset:96 |
| ; GFX1250-NEXT: global_store_b128 v52, v[28:31], s[0:1] offset:112 |
| ; GFX1250-NEXT: global_store_b128 v52, v[16:19], s[0:1] offset:64 |
| ; GFX1250-NEXT: global_store_b128 v52, v[4:7], s[0:1] offset:80 |
| ; GFX1250-NEXT: global_store_b128 v52, v[20:23], s[0:1] offset:32 |
| ; GFX1250-NEXT: global_store_b128 v52, v[12:15], s[0:1] offset:48 |
| ; GFX1250-NEXT: global_store_b128 v52, v[8:11], s[0:1] |
| ; GFX1250-NEXT: global_store_b128 v52, v[0:3], s[0:1] offset:16 |
| ; GFX1250-NEXT: s_endpgm |
| ; |
| ; GFX1251-LABEL: v_shl_v16i64: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_lshlrev_b32_e32 v52, 7, v0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_clause 0xb |
| ; GFX1251-NEXT: global_load_b128 v[0:3], v52, s[2:3] offset:144 |
| ; GFX1251-NEXT: global_load_b128 v[4:7], v52, s[2:3] offset:16 |
| ; GFX1251-NEXT: global_load_b128 v[8:11], v52, s[2:3] |
| ; GFX1251-NEXT: global_load_b128 v[12:15], v52, s[2:3] offset:128 |
| ; GFX1251-NEXT: global_load_b128 v[16:19], v52, s[2:3] offset:176 |
| ; GFX1251-NEXT: global_load_b128 v[20:23], v52, s[2:3] offset:48 |
| ; GFX1251-NEXT: global_load_b128 v[24:27], v52, s[2:3] offset:32 |
| ; GFX1251-NEXT: global_load_b128 v[28:31], v52, s[2:3] offset:160 |
| ; GFX1251-NEXT: global_load_b128 v[32:35], v52, s[2:3] offset:96 |
| ; GFX1251-NEXT: global_load_b128 v[36:39], v52, s[2:3] offset:112 |
| ; GFX1251-NEXT: global_load_b128 v[40:43], v52, s[2:3] offset:64 |
| ; GFX1251-NEXT: global_load_b128 v[44:47], v52, s[2:3] offset:80 |
| ; GFX1251-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[6:7] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[4:5] |
| ; GFX1251-NEXT: s_clause 0x1 |
| ; GFX1251-NEXT: global_load_b128 v[4:7], v52, s[2:3] offset:224 |
| ; GFX1251-NEXT: global_load_b128 v[48:51], v52, s[2:3] offset:240 |
| ; GFX1251-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[10:11], v14, v[10:11] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[8:9], v12, v[8:9] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[14:15], v18, v[22:23] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[12:13], v16, v[20:21] |
| ; GFX1251-NEXT: s_clause 0x1 |
| ; GFX1251-NEXT: global_load_b128 v[16:19], v52, s[2:3] offset:208 |
| ; GFX1251-NEXT: global_load_b128 v[20:23], v52, s[2:3] offset:192 |
| ; GFX1251-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[26:27], v30, v[26:27] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[24:25], v28, v[24:25] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x3 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[6:7], v6, v[34:35] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[4:5], v4, v[32:33] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[38:39], v50, v[38:39] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[36:37], v48, v[36:37] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x1 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[18:19], v18, v[46:47] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[22:23], v22, v[42:43] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[20:21], v20, v[40:41] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[16:17], v16, v[44:45] |
| ; GFX1251-NEXT: s_clause 0x7 |
| ; GFX1251-NEXT: global_store_b128 v52, v[4:7], s[0:1] offset:96 |
| ; GFX1251-NEXT: global_store_b128 v52, v[36:39], s[0:1] offset:112 |
| ; GFX1251-NEXT: global_store_b128 v52, v[20:23], s[0:1] offset:64 |
| ; GFX1251-NEXT: global_store_b128 v52, v[16:19], s[0:1] offset:80 |
| ; GFX1251-NEXT: global_store_b128 v52, v[24:27], s[0:1] offset:32 |
| ; GFX1251-NEXT: global_store_b128 v52, v[12:15], s[0:1] offset:48 |
| ; GFX1251-NEXT: global_store_b128 v52, v[8:11], s[0:1] |
| ; GFX1251-NEXT: global_store_b128 v52, v[0:3], s[0:1] offset:16 |
| ; GFX1251-NEXT: s_endpgm |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid.ext = sext i32 %tid to i64 |
| %in.gep = getelementptr inbounds <16 x i64>, ptr addrspace(1) %in, i64 %tid.ext |
| %out.gep = getelementptr inbounds <16 x i64>, ptr addrspace(1) %out, i64 %tid.ext |
; The shift amounts live one <16 x i64> element past the value being shifted.
| %b_ptr = getelementptr <16 x i64>, ptr addrspace(1) %in.gep, i32 1 |
| %a = load <16 x i64>, ptr addrspace(1) %in.gep |
| %b = load <16 x i64>, ptr addrspace(1) %b_ptr |
| %result = shl <16 x i64> %a, %b |
| store <16 x i64> %result, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| define amdgpu_kernel void @v_shl_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { |
| ; GFX1250-LABEL: v_shl_v32i64: |
| ; GFX1250: ; %bb.0: |
| ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 8, v0 |
| ; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1250-NEXT: s_clause 0x13 |
| ; GFX1250-NEXT: global_load_b128 v[0:3], v24, s[2:3] offset:400 |
| ; GFX1250-NEXT: global_load_b128 v[4:7], v24, s[2:3] offset:144 |
| ; GFX1250-NEXT: global_load_b128 v[8:11], v24, s[2:3] offset:128 |
| ; GFX1250-NEXT: global_load_b128 v[12:15], v24, s[2:3] offset:384 |
| ; GFX1250-NEXT: global_load_b128 v[16:19], v24, s[2:3] offset:432 |
| ; GFX1250-NEXT: global_load_b128 v[20:23], v24, s[2:3] offset:176 |
| ; GFX1250-NEXT: global_load_b128 v[26:29], v24, s[2:3] offset:160 |
| ; GFX1250-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:416 |
| ; GFX1250-NEXT: global_load_b128 v[34:37], v24, s[2:3] offset:464 |
| ; GFX1250-NEXT: global_load_b128 v[38:41], v24, s[2:3] offset:208 |
| ; GFX1250-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:192 |
| ; GFX1250-NEXT: global_load_b128 v[46:49], v24, s[2:3] offset:448 |
| ; GFX1250-NEXT: global_load_b128 v[50:53], v24, s[2:3] offset:496 |
| ; GFX1250-NEXT: global_load_b128 v[54:57], v24, s[2:3] offset:240 |
| ; GFX1250-NEXT: global_load_b128 v[58:61], v24, s[2:3] offset:224 |
| ; GFX1250-NEXT: global_load_b128 v[62:65], v24, s[2:3] offset:480 |
| ; GFX1250-NEXT: global_load_b128 v[66:69], v24, s[2:3] offset:16 |
| ; GFX1250-NEXT: global_load_b128 v[70:73], v24, s[2:3] |
| ; GFX1250-NEXT: global_load_b128 v[74:77], v24, s[2:3] offset:272 |
| ; GFX1250-NEXT: global_load_b128 v[78:81], v24, s[2:3] offset:256 |
| ; GFX1250-NEXT: s_wait_loadcnt 0x12 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[6:7] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[4:5] |
| ; GFX1250-NEXT: s_wait_loadcnt 0x10 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v14, v[10:11] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v12, v[8:9] |
| ; GFX1250-NEXT: s_wait_loadcnt 0xe |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[10:11], v18, v[22:23] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[8:9], v16, v[20:21] |
| ; GFX1250-NEXT: s_wait_loadcnt 0xc |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[14:15], v32, v[28:29] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[12:13], v30, v[26:27] |
| ; GFX1250-NEXT: global_load_b128 v[26:29], v24, s[2:3] offset:32 |
| ; GFX1250-NEXT: s_wait_loadcnt 0xb |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[18:19], v36, v[40:41] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[16:17], v34, v[38:39] |
| ; GFX1250-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:288 |
| ; GFX1250-NEXT: s_wait_loadcnt 0xa |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[22:23], v48, v[44:45] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[20:21], v46, v[42:43] |
| ; GFX1250-NEXT: s_clause 0x1 |
| ; GFX1250-NEXT: global_load_b128 v[34:37], v24, s[2:3] offset:320 |
| ; GFX1250-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:64 |
| ; GFX1250-NEXT: s_wait_loadcnt 0xa |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[40:41], v52, v[56:57] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[38:39], v50, v[54:55] |
| ; GFX1250-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[48:49], v64, v[60:61] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[46:47], v62, v[58:59] |
| ; GFX1250-NEXT: global_load_b128 v[50:53], v24, s[2:3] offset:80 |
| ; GFX1250-NEXT: s_wait_loadcnt 0x6 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[56:57], v76, v[68:69] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[54:55], v74, v[66:67] |
| ; GFX1250-NEXT: global_load_b128 v[58:61], v24, s[2:3] offset:48 |
| ; GFX1250-NEXT: s_wait_loadcnt 0x6 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[64:65], v80, v[72:73] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[62:63], v78, v[70:71] |
| ; GFX1250-NEXT: global_load_b128 v[66:69], v24, s[2:3] offset:304 |
| ; GFX1250-NEXT: s_wait_loadcnt 0x5 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[28:29], v32, v[28:29] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[26:27], v30, v[26:27] |
| ; GFX1250-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:336 |
| ; GFX1250-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[36:37], v36, v[44:45] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[34:35], v34, v[42:43] |
| ; GFX1250-NEXT: s_clause 0x3 |
| ; GFX1250-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:352 |
| ; GFX1250-NEXT: global_load_b128 v[70:73], v24, s[2:3] offset:96 |
| ; GFX1250-NEXT: global_load_b128 v[74:77], v24, s[2:3] offset:112 |
| ; GFX1250-NEXT: global_load_b128 v[78:81], v24, s[2:3] offset:368 |
| ; GFX1250-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[32:33], v32, v[52:53] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[30:31], v30, v[50:51] |
| ; GFX1250-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[44:45], v44, v[72:73] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[42:43], v42, v[70:71] |
| ; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[72:73], v80, v[76:77] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[70:71], v78, v[74:75] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[52:53], v68, v[60:61] |
| ; GFX1250-NEXT: v_lshlrev_b64_e32 v[50:51], v66, v[58:59] |
| ; GFX1250-NEXT: s_clause 0xf |
| ; GFX1250-NEXT: global_store_b128 v24, v[42:45], s[0:1] offset:96 |
| ; GFX1250-NEXT: global_store_b128 v24, v[70:73], s[0:1] offset:112 |
| ; GFX1250-NEXT: global_store_b128 v24, v[34:37], s[0:1] offset:64 |
| ; GFX1250-NEXT: global_store_b128 v24, v[30:33], s[0:1] offset:80 |
| ; GFX1250-NEXT: global_store_b128 v24, v[26:29], s[0:1] offset:32 |
| ; GFX1250-NEXT: global_store_b128 v24, v[50:53], s[0:1] offset:48 |
| ; GFX1250-NEXT: global_store_b128 v24, v[62:65], s[0:1] |
| ; GFX1250-NEXT: global_store_b128 v24, v[54:57], s[0:1] offset:16 |
| ; GFX1250-NEXT: global_store_b128 v24, v[46:49], s[0:1] offset:224 |
| ; GFX1250-NEXT: global_store_b128 v24, v[38:41], s[0:1] offset:240 |
| ; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:192 |
| ; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:208 |
| ; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:160 |
| ; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:176 |
| ; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:128 |
| ; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:144 |
| ; GFX1250-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| ; GFX1250-NEXT: s_endpgm |
| ; |
| ; GFX1251-LABEL: v_shl_v32i64: |
| ; GFX1251: ; %bb.0: |
| ; GFX1251-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 |
| ; GFX1251-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv |
| ; GFX1251-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| ; GFX1251-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX1251-NEXT: v_lshlrev_b32_e32 v24, 8, v0 |
| ; GFX1251-NEXT: s_wait_kmcnt 0x0 |
| ; GFX1251-NEXT: s_clause 0x13 |
| ; GFX1251-NEXT: global_load_b128 v[0:3], v24, s[2:3] offset:400 |
| ; GFX1251-NEXT: global_load_b128 v[4:7], v24, s[2:3] offset:144 |
| ; GFX1251-NEXT: global_load_b128 v[8:11], v24, s[2:3] offset:128 |
| ; GFX1251-NEXT: global_load_b128 v[12:15], v24, s[2:3] offset:384 |
| ; GFX1251-NEXT: global_load_b128 v[16:19], v24, s[2:3] offset:432 |
| ; GFX1251-NEXT: global_load_b128 v[20:23], v24, s[2:3] offset:176 |
| ; GFX1251-NEXT: global_load_b128 v[26:29], v24, s[2:3] offset:160 |
| ; GFX1251-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:416 |
| ; GFX1251-NEXT: global_load_b128 v[34:37], v24, s[2:3] offset:464 |
| ; GFX1251-NEXT: global_load_b128 v[38:41], v24, s[2:3] offset:208 |
| ; GFX1251-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:192 |
| ; GFX1251-NEXT: global_load_b128 v[46:49], v24, s[2:3] offset:448 |
| ; GFX1251-NEXT: global_load_b128 v[50:53], v24, s[2:3] offset:496 |
| ; GFX1251-NEXT: global_load_b128 v[54:57], v24, s[2:3] offset:240 |
| ; GFX1251-NEXT: global_load_b128 v[58:61], v24, s[2:3] offset:224 |
| ; GFX1251-NEXT: global_load_b128 v[62:65], v24, s[2:3] offset:480 |
| ; GFX1251-NEXT: global_load_b128 v[66:69], v24, s[2:3] offset:16 |
| ; GFX1251-NEXT: global_load_b128 v[70:73], v24, s[2:3] |
| ; GFX1251-NEXT: global_load_b128 v[74:77], v24, s[2:3] offset:272 |
| ; GFX1251-NEXT: global_load_b128 v[78:81], v24, s[2:3] offset:256 |
| ; GFX1251-NEXT: s_wait_loadcnt 0x12 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[2:3], v2, v[6:7] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[0:1], v0, v[4:5] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x10 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[6:7], v14, v[10:11] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[4:5], v12, v[8:9] |
| ; GFX1251-NEXT: s_wait_loadcnt 0xe |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[10:11], v18, v[22:23] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[8:9], v16, v[20:21] |
| ; GFX1251-NEXT: s_wait_loadcnt 0xc |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[14:15], v32, v[28:29] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[12:13], v30, v[26:27] |
| ; GFX1251-NEXT: global_load_b128 v[26:29], v24, s[2:3] offset:48 |
| ; GFX1251-NEXT: s_wait_loadcnt 0xb |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[18:19], v36, v[40:41] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[16:17], v34, v[38:39] |
| ; GFX1251-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:304 |
| ; GFX1251-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[22:23], v48, v[44:45] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[20:21], v46, v[42:43] |
| ; GFX1251-NEXT: s_clause 0x1 |
| ; GFX1251-NEXT: global_load_b128 v[34:37], v24, s[2:3] offset:80 |
| ; GFX1251-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:336 |
| ; GFX1251-NEXT: s_wait_loadcnt 0xa |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[40:41], v52, v[56:57] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[38:39], v50, v[54:55] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x8 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[48:49], v64, v[60:61] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[46:47], v62, v[58:59] |
| ; GFX1251-NEXT: global_load_b128 v[50:53], v24, s[2:3] offset:64 |
| ; GFX1251-NEXT: s_wait_loadcnt 0x6 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[56:57], v76, v[68:69] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[54:55], v74, v[66:67] |
| ; GFX1251-NEXT: global_load_b128 v[58:61], v24, s[2:3] offset:32 |
| ; GFX1251-NEXT: s_wait_loadcnt 0x6 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[64:65], v80, v[72:73] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[62:63], v78, v[70:71] |
| ; GFX1251-NEXT: global_load_b128 v[66:69], v24, s[2:3] offset:288 |
| ; GFX1251-NEXT: s_wait_loadcnt 0x5 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[28:29], v32, v[28:29] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[26:27], v30, v[26:27] |
| ; GFX1251-NEXT: global_load_b128 v[30:33], v24, s[2:3] offset:320 |
| ; GFX1251-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[36:37], v44, v[36:37] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[34:35], v42, v[34:35] |
| ; GFX1251-NEXT: s_clause 0x3 |
| ; GFX1251-NEXT: global_load_b128 v[42:45], v24, s[2:3] offset:368 |
| ; GFX1251-NEXT: global_load_b128 v[70:73], v24, s[2:3] offset:112 |
| ; GFX1251-NEXT: global_load_b128 v[74:77], v24, s[2:3] offset:96 |
| ; GFX1251-NEXT: global_load_b128 v[78:81], v24, s[2:3] offset:352 |
| ; GFX1251-NEXT: s_wait_loadcnt 0x4 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[32:33], v32, v[52:53] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[30:31], v30, v[50:51] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x2 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[44:45], v44, v[72:73] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[42:43], v42, v[70:71] |
| ; GFX1251-NEXT: s_wait_loadcnt 0x0 |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[72:73], v80, v[76:77] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[70:71], v78, v[74:75] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[52:53], v68, v[60:61] |
| ; GFX1251-NEXT: v_lshlrev_b64_e32 v[50:51], v66, v[58:59] |
| ; GFX1251-NEXT: s_clause 0xf |
| ; GFX1251-NEXT: global_store_b128 v24, v[70:73], s[0:1] offset:96 |
| ; GFX1251-NEXT: global_store_b128 v24, v[42:45], s[0:1] offset:112 |
| ; GFX1251-NEXT: global_store_b128 v24, v[30:33], s[0:1] offset:64 |
| ; GFX1251-NEXT: global_store_b128 v24, v[34:37], s[0:1] offset:80 |
| ; GFX1251-NEXT: global_store_b128 v24, v[50:53], s[0:1] offset:32 |
| ; GFX1251-NEXT: global_store_b128 v24, v[26:29], s[0:1] offset:48 |
| ; GFX1251-NEXT: global_store_b128 v24, v[62:65], s[0:1] |
| ; GFX1251-NEXT: global_store_b128 v24, v[54:57], s[0:1] offset:16 |
| ; GFX1251-NEXT: global_store_b128 v24, v[46:49], s[0:1] offset:224 |
| ; GFX1251-NEXT: global_store_b128 v24, v[38:41], s[0:1] offset:240 |
| ; GFX1251-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:192 |
| ; GFX1251-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:208 |
| ; GFX1251-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:160 |
| ; GFX1251-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:176 |
| ; GFX1251-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:128 |
| ; GFX1251-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:144 |
| ; GFX1251-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) |
| ; GFX1251-NEXT: s_endpgm |
| %tid = call i32 @llvm.amdgcn.workitem.id.x() |
| %tid.ext = sext i32 %tid to i64 |
| %in.gep = getelementptr inbounds <32 x i64>, ptr addrspace(1) %in, i64 %tid.ext |
| %out.gep = getelementptr inbounds <32 x i64>, ptr addrspace(1) %out, i64 %tid.ext |
| %b_ptr = getelementptr <32 x i64>, ptr addrspace(1) %in.gep, i32 1 |
| %a = load <32 x i64>, ptr addrspace(1) %in.gep |
| %b = load <32 x i64>, ptr addrspace(1) %b_ptr |
| %result = shl <32 x i64> %a, %b |
| store <32 x i64> %result, ptr addrspace(1) %out.gep |
| ret void |
| } |
| |
| ; Work-item id intrinsic (x dimension). v_shl_v32i64 calls it to obtain %tid, |
| ; which is sign-extended and used to index both %in and %out. |
| declare i32 @llvm.amdgcn.workitem.id.x() #1 |
| |
| ; Attribute group #0 is attached to the kernel definitions in this file. |
| ; nounwind - the function does not unwind (see the LLVM LangRef). |
| attributes #0 = { nounwind } |
| ; Attribute group #1 is attached to the intrinsic declaration above. |
| ; readnone - the callee does not access memory (see the LLVM LangRef). |
| attributes #1 = { nounwind readnone } |