| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s |
| ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s |
| |
; Scheduling test, "cluster" shape: five <16 x half> loads from %in feed five
; WMMA calls whose results are stored to %out. The three
; llvm.amdgcn.sched.group.barrier calls at the end of the body request, in
; order, one group of 10 DS reads, one group of 5 WMMAs and one group of
; 10 DS writes, all with SyncID 0. The autogenerated checks (GCN and
; EXACTCUTOFF prefixes) expect that shape: ten ds_load_b128, then five
; v_wmma_f16_16x16x16_f16, then ten ds_store_b128.
| define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { |
| ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v40, 5, v0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 |
| ; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 |
| ; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16 |
| ; GCN-NEXT: ds_load_b128 v[12:15], v32 offset:2064 |
| ; GCN-NEXT: ds_load_b128 v[20:23], v32 offset:6160 |
| ; GCN-NEXT: ds_load_b128 v[28:31], v32 offset:12304 |
| ; GCN-NEXT: ds_load_b128 v[36:39], v32 offset:20496 |
| ; GCN-NEXT: ds_load_b128 v[0:3], v32 |
| ; GCN-NEXT: ds_load_b128 v[8:11], v32 offset:2048 |
| ; GCN-NEXT: ds_load_b128 v[16:19], v32 offset:6144 |
| ; GCN-NEXT: ds_load_b128 v[24:27], v32 offset:12288 |
| ; GCN-NEXT: ds_load_b128 v[32:35], v32 offset:20480 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(10) SyncID(0) |
| ; GCN-NEXT: s_waitcnt lgkmcnt(4) |
| ; GCN-NEXT: v_mov_b32_e32 v47, v7 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(3) |
| ; GCN-NEXT: v_mov_b32_e32 v55, v15 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(2) |
| ; GCN-NEXT: v_mov_b32_e32 v63, v23 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(1) |
| ; GCN-NEXT: v_mov_b32_e32 v71, v31 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6 |
| ; GCN-NEXT: v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4 |
| ; GCN-NEXT: v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2 |
| ; GCN-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0 |
| ; GCN-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13 |
| ; GCN-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11 |
| ; GCN-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9 |
| ; GCN-NEXT: v_mov_b32_e32 v48, v8 |
| ; GCN-NEXT: v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21 |
| ; GCN-NEXT: v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19 |
| ; GCN-NEXT: v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17 |
| ; GCN-NEXT: v_mov_b32_e32 v56, v16 |
| ; GCN-NEXT: v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29 |
| ; GCN-NEXT: v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27 |
| ; GCN-NEXT: v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25 |
| ; GCN-NEXT: v_mov_b32_e32 v64, v24 |
| ; GCN-NEXT: v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37 |
| ; GCN-NEXT: v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35 |
| ; GCN-NEXT: v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33 |
| ; GCN-NEXT: v_mov_b32_e32 v72, v32 |
| ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[40:47], v[0:7], v[0:7], v[40:47] |
| ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[48:55], v[8:15], v[8:15], v[48:55] |
| ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[56:63], v[16:23], v[16:23], v[56:63] |
| ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[64:71], v[24:31], v[24:31], v[64:71] |
| ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[72:79], v[32:39], v[32:39], v[72:79] |
| ; GCN-NEXT: ds_store_b128 v80, v[44:47] offset:16 |
| ; GCN-NEXT: ds_store_b128 v80, v[40:43] |
| ; GCN-NEXT: ds_store_b128 v81, v[52:55] offset:2064 |
| ; GCN-NEXT: ds_store_b128 v81, v[48:51] offset:2048 |
| ; GCN-NEXT: ds_store_b128 v81, v[60:63] offset:4112 |
| ; GCN-NEXT: ds_store_b128 v81, v[56:59] offset:4096 |
| ; GCN-NEXT: ds_store_b128 v81, v[68:71] offset:6160 |
| ; GCN-NEXT: ds_store_b128 v81, v[64:67] offset:6144 |
| ; GCN-NEXT: ds_store_b128 v81, v[76:79] offset:8208 |
| ; GCN-NEXT: ds_store_b128 v81, v[72:75] offset:8192 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(10) SyncID(0) |
| ; GCN-NEXT: s_endpgm |
| ; |
| ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: |
| ; EXACTCUTOFF: ; %bb.0: ; %entry |
| ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 |
| ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v40, 5, v0 |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v32 offset:2064 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v32 offset:6160 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[28:31], v32 offset:12304 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[36:39], v32 offset:20496 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v32 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v32 offset:2048 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v32 offset:6144 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v32 offset:12288 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[32:35], v32 offset:20480 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(10) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(4) |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v47, v7 |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(3) |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v55, v15 |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v63, v23 |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v71, v31 |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, v8 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v56, v16 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v64, v24 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v72, v32 |
| ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[40:47], v[0:7], v[0:7], v[40:47] |
| ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[48:55], v[8:15], v[8:15], v[48:55] |
| ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[56:63], v[16:23], v[16:23], v[56:63] |
| ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[64:71], v[24:31], v[24:31], v[64:71] |
| ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[72:79], v[32:39], v[32:39], v[72:79] |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v80, v[44:47] offset:16 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v80, v[40:43] |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[52:55] offset:2064 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[48:51] offset:2048 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[60:63] offset:4112 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[56:59] offset:4096 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[68:71] offset:6160 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[64:67] offset:6144 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[76:79] offset:8208 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v81, v[72:75] offset:8192 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(10) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_endpgm |
| entry: |
| %idx = call i32 @llvm.amdgcn.workitem.id.x() |
; Load addresses are chained: each getelementptr below is relative to the
; previous load address, so the element indices accumulate (64, 64+128, ...).
| %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx |
| %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr |
| %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64 |
| %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr |
| %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128 |
| %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr |
| %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192 |
| %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr |
| %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256 |
| %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr |
; Each WMMA call passes its loaded vector as all three vector operands.
| %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0) |
| %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0) |
| %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0) |
| %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0) |
| %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0) |
; Unlike the loads, only store 0 is indexed by %idx; stores 1-4 use constant
; element indices taken directly from %out (not chained).
| %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx |
| store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr |
| %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64 |
| store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr |
| %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128 |
| store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr |
| %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192 |
| store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr |
| %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256 |
| store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr |
; The three i32 operands of each barrier call appear in the expected assembly
; as "sched_group_barrier mask(...) size(...) SyncID(...)", in that order
; (e.g. 256, 10, 0 is printed as mask(0x00000100) size(10) SyncID(0)).
| ; 10 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 10, i32 0) |
| ; 5 WMMA |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0) |
| ; 10 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 10, i32 0) |
| ret void |
| } |
| |
; Scheduling test, "interleave" shape: five <16 x half> loads from %in feed
; five WMMA calls whose results are stored to %out. The body ends with
; fifteen llvm.amdgcn.sched.group.barrier calls: the triple
; (2 DS reads, 1 WMMA, 2 DS writes) repeated five times, all with SyncID 0.
; The autogenerated checks (GCN and EXACTCUTOFF prefixes) expect two
; ds_load_b128 ahead of each v_wmma_f16_16x16x16_f16 and two ds_store_b128
; after it, with the next pair of loads following those stores.
| define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { |
| ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 |
| ; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 |
| ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 |
| ; GCN-NEXT: ds_load_b128 v[0:3], v17 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 |
| ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 |
| ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 |
| ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:16 |
| ; GCN-NEXT: ds_store_b128 v16, v[8:11] |
| ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:2064 |
| ; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:2048 |
| ; GCN-NEXT: v_mov_b32_e32 v16, s1 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 |
| ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 |
| ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 |
| ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:2064 |
| ; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:2048 |
| ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:6160 |
| ; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:6144 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 |
| ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 |
| ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 |
| ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:4112 |
| ; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:4096 |
| ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:12304 |
| ; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:12288 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 |
| ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 |
| ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 |
| ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:6160 |
| ; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:6144 |
| ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:20496 |
| ; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:20480 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 |
| ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 |
| ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 |
| ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 |
| ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:8208 |
| ; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:8192 |
| ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) |
| ; GCN-NEXT: s_endpgm |
| ; |
| ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: |
| ; EXACTCUTOFF: ; %bb.0: ; %entry |
| ; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 |
| ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0 |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 |
| ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:16 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:2064 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:2048 |
| ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:2064 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:2048 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:6160 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:6144 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:4112 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:4096 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:12304 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:12288 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:6160 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:6144 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:20496 |
| ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:20480 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 |
| ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 |
| ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:8208 |
| ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:8192 |
| ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) |
| ; EXACTCUTOFF-NEXT: s_endpgm |
| entry: |
| %idx = call i32 @llvm.amdgcn.workitem.id.x() |
; Load addresses are chained: each getelementptr below is relative to the
; previous load address, so the element indices accumulate (64, 64+128, ...).
| %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx |
| %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr |
| %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64 |
| %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr |
| %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128 |
| %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr |
| %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192 |
| %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr |
| %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256 |
| %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr |
; Each WMMA call passes its loaded vector as all three vector operands.
| %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0) |
| %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0) |
| %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0) |
| %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0) |
| %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0) |
; Unlike the loads, only store 0 is indexed by %idx; stores 1-4 use constant
; element indices taken directly from %out (not chained).
| %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx |
| store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr |
| %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64 |
| store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr |
| %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128 |
| store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr |
| %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192 |
| store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr |
| %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256 |
| store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr |
; The three i32 operands of each barrier call appear in the expected assembly
; as "sched_group_barrier mask(...) size(...) SyncID(...)", in that order
; (e.g. 256, 2, 0 is printed as mask(0x00000100) size(2) SyncID(0)).
; The same read/WMMA/write triple is issued five times, once per WMMA.
| ; 2 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) |
| ; 1 WMMA |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) |
| ; 2 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) |
| ; 2 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) |
| ; 1 WMMA |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) |
| ; 2 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) |
| ; 2 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) |
| ; 1 WMMA |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) |
| ; 2 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) |
| ; 2 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) |
| ; 1 WMMA |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) |
| ; 2 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) |
| ; 2 DS read |
| call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) |
| ; 1 WMMA |
| call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) |
| ; 2 DS write |
| call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) |
| ret void |
| } |
| |
; Intrinsics called by the two test kernels in this file.
| declare i32 @llvm.amdgcn.workitem.id.x() #2 |
; The three i32 operands are printed in the expected assembly as
; "sched_group_barrier mask(...) size(...) SyncID(...)", in that order.
| declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1 |
; The trailing i1 is declared immarg; both kernels pass the constant 0.
| declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 |
| |
; Attribute groups: #0 is attached to both kernel definitions, #1 to the
; sched.group.barrier and wmma declarations, #2 to workitem.id.x.
| attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,32" } |
| attributes #1 = { nounwind } |
| attributes #2 = { nounwind readnone speculatable } |