| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s |
| |
| ; ModuleID = 'kernel_round1_passing.bc' |
| source_filename = "/tmp/comgr-295d04/input/CompileSource" |
| @kernel_round1.first_words_data = external hidden unnamed_addr addrspace(3) global [896 x i8], align 1 |
| @kernel_round1.collisionsData = external hidden unnamed_addr addrspace(3) global [3840 x i32], align 4 |
| @kernel_round1.collisionsNum = external hidden addrspace(3) global i32, align 4 |
| |
| ; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) |
| declare hidden i64 @_Z13get_global_idj(i32 noundef) local_unnamed_addr #0 |
| |
| ; Function Attrs: convergent nounwind |
| declare hidden i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1 |
| |
| ; Function Attrs: convergent nounwind |
| declare hidden i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1 |
| |
| ; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) |
| declare hidden i64 @_Z12get_local_idj(i32 noundef) local_unnamed_addr #0 |
| |
| ; Function Attrs: convergent nounwind |
| declare hidden void @_Z7barrierj(i32 noundef) local_unnamed_addr #1 |
| |
| ; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) |
| declare hidden i32 @_Z3minjj(i32 noundef, i32 noundef) local_unnamed_addr #0 |
| |
| ; Function Attrs: convergent nounwind |
| declare hidden i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef) local_unnamed_addr #1 |
| |
| ; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) |
| declare hidden i64 @_Z14get_local_sizej(i32 noundef) local_unnamed_addr #0 |
| |
| ; Function Attrs: convergent norecurse nounwind |
| define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture noundef readonly align 1 %0, ptr addrspace(1) nocapture noundef writeonly align 1 %1, ptr addrspace(1) nocapture noundef readonly align 4 %2, ptr addrspace(1) noundef align 4 %3, ptr addrspace(1) nocapture noundef readnone align 4 %4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 { |
| ; CHECK-LABEL: kernel_round1: |
| ; CHECK: ; %bb.0: |
| ; CHECK-NEXT: s_add_u32 s12, s12, s17 |
| ; CHECK-NEXT: s_mov_b32 s32, 0 |
| ; CHECK-NEXT: s_addc_u32 s13, s13, 0 |
| ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 |
| ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 |
| ; CHECK-NEXT: s_load_dwordx8 s[64:71], s[8:9], 0x0 |
| ; CHECK-NEXT: s_add_u32 s0, s0, s17 |
| ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] |
| ; CHECK-NEXT: s_addc_u32 s1, s1, 0 |
| ; CHECK-NEXT: v_mov_b32_e32 v40, v0 |
| ; CHECK-NEXT: s_add_u32 s52, s34, 40 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| ; CHECK-NEXT: s_mov_b32 s33, s16 |
| ; CHECK-NEXT: s_addc_u32 s53, s35, 0 |
| ; CHECK-NEXT: s_mov_b32 s51, s14 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] |
| ; CHECK-NEXT: s_mov_b32 s12, s14 |
| ; CHECK-NEXT: s_mov_b32 s13, s15 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: s_mov_b32 s50, s15 |
| ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] |
| ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] |
| ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] |
| ; CHECK-NEXT: v_mov_b32_e32 v45, 0 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_mov_b32_e32 v43, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_mov_b32_e32 v41, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 1 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43 |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0 |
| ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1 |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: global_load_dword v0, v0, s[68:69] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, 12 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_mov_b32_e32 v42, v0 |
| ; CHECK-NEXT: s_mov_b32 s52, exec_lo |
| ; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_25 |
| ; CHECK-NEXT: ; %bb.1: ; %.preheader5 |
| ; CHECK-NEXT: v_mul_lo_u32 v0, v41, 14 |
| ; CHECK-NEXT: s_mov_b32 s4, 0 |
| ; CHECK-NEXT: s_mov_b32 s5, 0 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0 |
| ; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44 |
| ; CHECK-NEXT: s_add_i32 s5, s5, 1 |
| ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42 |
| ; CHECK-NEXT: ds_write_b8 v1, v45 |
| ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB0_2 |
| ; CHECK-NEXT: ; %bb.3: |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42 |
| ; CHECK-NEXT: s_mov_b32 s53, 0 |
| ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45 |
| ; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_25 |
| ; CHECK-NEXT: ; %bb.4: |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v47, 0 |
| ; CHECK-NEXT: s_mov_b32 s55, 0 |
| ; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 |
| ; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 |
| ; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44 |
| ; CHECK-NEXT: s_lshl_b32 s4, s55, 5 |
| ; CHECK-NEXT: s_add_i32 s54, s55, 1 |
| ; CHECK-NEXT: s_add_i32 s5, s55, 5 |
| ; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: ds_read_u8 v56, v0 |
| ; CHECK-NEXT: v_mov_b32_e32 v58, s54 |
| ; CHECK-NEXT: s_mov_b32 s68, exec_lo |
| ; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_17 |
| ; CHECK-NEXT: ; %bb.6: ; %.preheader2 |
| ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 |
| ; CHECK-NEXT: s_mov_b32 s69, 0 |
| ; CHECK-NEXT: s_mov_b32 s80, 0 |
| ; CHECK-NEXT: s_branch .LBB0_8 |
| ; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 |
| ; CHECK-NEXT: s_add_i32 s80, s80, 4 |
| ; CHECK-NEXT: s_add_i32 s4, s55, s80 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, s80, v57 |
| ; CHECK-NEXT: s_add_i32 s5, s4, 5 |
| ; CHECK-NEXT: s_add_i32 s4, s4, 1 |
| ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42 |
| ; CHECK-NEXT: v_mov_b32_e32 v58, s4 |
| ; CHECK-NEXT: s_or_b32 s69, vcc_lo, s69 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s69 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_16 |
| ; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1 |
| ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v59, s80, v46 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v58, s80, v57 |
| ; CHECK-NEXT: ds_read_u8 v0, v59 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD |
| ; CHECK-NEXT: s_and_saveexec_b32 s81, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_10 |
| ; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 |
| ; CHECK-NEXT: s_add_u32 s8, s34, 40 |
| ; CHECK-NEXT: s_addc_u32 s9, s35, 0 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; CHECK-NEXT: ds_write_b32 v0, v58 |
| ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 |
| ; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD |
| ; CHECK-NEXT: s_and_saveexec_b32 s81, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_12 |
| ; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 |
| ; CHECK-NEXT: s_add_u32 s8, s34, 40 |
| ; CHECK-NEXT: s_addc_u32 s9, s35, 0 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; CHECK-NEXT: ds_write_b32 v0, v60 |
| ; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 |
| ; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD |
| ; CHECK-NEXT: s_and_saveexec_b32 s81, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_14 |
| ; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 |
| ; CHECK-NEXT: s_add_u32 s8, s34, 40 |
| ; CHECK-NEXT: s_addc_u32 s9, s35, 0 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; CHECK-NEXT: ds_write_b32 v0, v60 |
| ; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 |
| ; CHECK-NEXT: ds_read_u8 v0, v59 offset:3 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD |
| ; CHECK-NEXT: s_and_saveexec_b32 s81, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_7 |
| ; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 |
| ; CHECK-NEXT: s_add_u32 s8, s34, 40 |
| ; CHECK-NEXT: s_addc_u32 s9, s35, 0 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; CHECK-NEXT: ds_write_b32 v0, v58 |
| ; CHECK-NEXT: s_branch .LBB0_7 |
| ; CHECK-NEXT: .LBB0_16: ; %Flow45 |
| ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69 |
| ; CHECK-NEXT: v_mov_b32_e32 v57, v0 |
| ; CHECK-NEXT: .LBB0_17: ; %Flow46 |
| ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68 |
| ; CHECK-NEXT: s_mov_b32 s55, exec_lo |
| ; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_23 |
| ; CHECK-NEXT: ; %bb.18: ; %.preheader |
| ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 |
| ; CHECK-NEXT: s_mov_b32 s68, 0 |
| ; CHECK-NEXT: s_inst_prefetch 0x1 |
| ; CHECK-NEXT: s_branch .LBB0_20 |
| ; CHECK-NEXT: .p2align 6 |
| ; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57 |
| ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42 |
| ; CHECK-NEXT: s_or_b32 s68, vcc_lo, s68 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s68 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_22 |
| ; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1 |
| ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58 |
| ; CHECK-NEXT: ds_read_u8 v0, v0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD |
| ; CHECK-NEXT: s_and_saveexec_b32 s69, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_19 |
| ; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 |
| ; CHECK-NEXT: s_add_u32 s8, s34, 40 |
| ; CHECK-NEXT: s_addc_u32 s9, s35, 0 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
| ; CHECK-NEXT: ds_write_b32 v0, v57 |
| ; CHECK-NEXT: s_branch .LBB0_19 |
| ; CHECK-NEXT: .LBB0_22: ; %Flow43 |
| ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 |
| ; CHECK-NEXT: s_inst_prefetch 0x2 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68 |
| ; CHECK-NEXT: .LBB0_23: ; %Flow44 |
| ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 |
| ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1 |
| ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45 |
| ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 |
| ; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 |
| ; CHECK-NEXT: s_mov_b32 s55, s54 |
| ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 |
| ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 |
| ; CHECK-NEXT: s_or_b32 s53, s4, s53 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB0_5 |
| ; CHECK-NEXT: .LBB0_25: ; %Flow51 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 1 |
| ; CHECK-NEXT: s_add_u32 s8, s34, 40 |
| ; CHECK-NEXT: s_addc_u32 s9, s35, 0 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| ; CHECK-NEXT: s_mov_b32 s4, exec_lo |
| ; CHECK-NEXT: ds_read_b32 v47, v0 offset:15360 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v41 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_33 |
| ; CHECK-NEXT: ; %bb.26: |
| ; CHECK-NEXT: s_mov_b32 s52, 0 |
| ; CHECK-NEXT: s_branch .LBB0_28 |
| ; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1 |
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| ; CHECK-NEXT: s_add_u32 s8, s34, 40 |
| ; CHECK-NEXT: s_addc_u32 s9, s35, 0 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v1, vcc_lo |
| ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41 |
| ; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52 |
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_33 |
| ; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41 |
| ; CHECK-NEXT: s_mov_b32 s53, exec_lo |
| ; CHECK-NEXT: ds_read_b32 v0, v0 |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0 |
| ; CHECK-NEXT: v_bfe_u32 v62, v0, 5, 5 |
| ; CHECK-NEXT: v_and_b32_e32 v72, 31, v0 |
| ; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63 |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62 |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72 |
| ; CHECK-NEXT: v_add_co_u32 v2, s4, s64, v1 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s65, 0, s4 |
| ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo |
| ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo |
| ; CHECK-NEXT: s_clause 0x1 |
| ; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:8 |
| ; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:8 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5 |
| ; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4 |
| ; CHECK-NEXT: v_xor_b32_e32 v57, v11, v7 |
| ; CHECK-NEXT: v_xor_b32_e32 v56, v10, v6 |
| ; CHECK-NEXT: v_or_b32_e32 v5, v46, v57 |
| ; CHECK-NEXT: v_or_b32_e32 v4, v45, v56 |
| ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5] |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_27 |
| ; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1 |
| ; CHECK-NEXT: s_clause 0x1 |
| ; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24 |
| ; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24 |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45 |
| ; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12 |
| ; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: s_add_u32 s8, s34, 40 |
| ; CHECK-NEXT: v_and_b32_e32 v3, 0xf000, v0 |
| ; CHECK-NEXT: v_and_b32_e32 v4, 0xf00, v1 |
| ; CHECK-NEXT: v_and_b32_e32 v0, 0xf0, v0 |
| ; CHECK-NEXT: v_and_b32_e32 v1, 15, v1 |
| ; CHECK-NEXT: s_addc_u32 s9, s35, 0 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_addPU3AS1Vjj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_addPU3AS1Vjj@rel32@hi+12 |
| ; CHECK-NEXT: v_or3_b32 v2, v3, v2, v4 |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: v_or3_b32 v73, v2, v0, v1 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v73 |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v73 |
| ; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0 |
| ; CHECK-NEXT: v_lshlrev_b32_e64 v44, v1, 1 |
| ; CHECK-NEXT: v_and_b32_e32 v74, 28, v1 |
| ; CHECK-NEXT: v_add_co_u32 v42, s4, s70, v0 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s71, 0, s4 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, v44 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, v42 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: v_mov_b32_e32 v1, v43 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: v_bfe_u32 v0, v0, v74, 4 |
| ; CHECK-NEXT: s_mov_b32 s4, exec_lo |
| ; CHECK-NEXT: v_cmpx_gt_u32_e32 12, v0 |
| ; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_31 |
| ; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1 |
| ; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58 |
| ; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57] |
| ; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[66:67] |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v10, 5, v0 |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v4 |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72 |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v9, 12, v63 |
| ; CHECK-NEXT: v_xor_b32_e32 v5, v61, v59 |
| ; CHECK-NEXT: v_lshlrev_b32_e32 v11, 16, v56 |
| ; CHECK-NEXT: v_or_b32_e32 v3, v1, v3 |
| ; CHECK-NEXT: v_lshrrev_b64 v[0:1], 16, v[45:46] |
| ; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, v10 |
| ; CHECK-NEXT: v_or3_b32 v8, v8, v9, v62 |
| ; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo |
| ; CHECK-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] |
| ; CHECK-NEXT: v_or_b32_e32 v1, v11, v1 |
| ; CHECK-NEXT: ; implicit-def: $vgpr42 |
| ; CHECK-NEXT: ; implicit-def: $vgpr43 |
| ; CHECK-NEXT: ; implicit-def: $vgpr44 |
| ; CHECK-NEXT: global_store_dword v[6:7], v8, off offset:4 |
| ; CHECK-NEXT: global_store_dwordx4 v[6:7], v[0:3], off offset:8 |
| ; CHECK-NEXT: global_store_dwordx2 v[6:7], v[4:5], off offset:24 |
| ; CHECK-NEXT: .LBB0_31: ; %Flow |
| ; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1 |
| ; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4 |
| ; CHECK-NEXT: s_cbranch_execz .LBB0_27 |
| ; CHECK-NEXT: ; %bb.32: ; in Loop: Header=BB0_28 Depth=1 |
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, v42 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, v43 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, v44 |
| ; CHECK-NEXT: s_add_u32 s8, s34, 40 |
| ; CHECK-NEXT: s_addc_u32 s9, s35, 0 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj@rel32@hi+12 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] |
| ; CHECK-NEXT: s_mov_b32 s12, s51 |
| ; CHECK-NEXT: s_mov_b32 s13, s50 |
| ; CHECK-NEXT: s_mov_b32 s14, s33 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: s_branch .LBB0_27 |
| ; CHECK-NEXT: .LBB0_33: |
| ; CHECK-NEXT: s_endpgm |
| %6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4 |
| %7 = trunc i64 %6 to i32 |
| %8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4 |
| %9 = trunc i64 %8 to i32 |
| %10 = mul i32 %9, 14 |
| %11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %10 |
| store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11 |
| tail call void @_Z7barrierj(i32 noundef 1) #5 |
| %12 = lshr i64 %6, 3 |
| %13 = shl i32 %7, 2 |
| %14 = and i32 %13, 28 |
| %15 = and i64 %12, 536870911 |
| %16 = getelementptr inbounds i32, ptr addrspace(1) %2, i64 %15 |
| %17 = load i32, ptr addrspace(1) %16, align 4, !tbaa !11 |
| %18 = lshr i32 %17, %14 |
| %19 = and i32 %18, 15 |
| %20 = tail call i32 @_Z3minjj(i32 noundef %19, i32 noundef 12) #4 |
| %21 = icmp eq i32 %20, 0 |
| br i1 %21, label %119, label %27 |
| |
| 22: ; preds = %27 |
| %23 = add i32 %20, -1 |
| %24 = icmp eq i32 %23, 0 |
| br i1 %24, label %119, label %25 |
| |
| 25: ; preds = %22 |
| %26 = shl i32 %7, 10 |
| br label %37 |
| |
| 27: ; preds = %5, %27 |
| %28 = phi i32 [ %30, %27 ], [ 0, %5 ] |
| %29 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %28 |
| store i8 0, ptr addrspace(3) %29, align 1, !tbaa !15 |
| %30 = add nuw i32 %28, 1 |
| %31 = icmp eq i32 %30, %20 |
| br i1 %31, label %22, label %27 |
| |
| 32: ; preds = %114, %48 |
| %33 = phi i32 [ %50, %48 ], [ %115, %114 ] |
| %34 = icmp ult i32 %44, %23 |
| %35 = icmp ult i32 %33, 60 |
| %36 = select i1 %34, i1 %35, i1 false |
| br i1 %36, label %37, label %119 |
| |
| 37: ; preds = %32, %25 |
| %38 = phi i32 [ 0, %25 ], [ %44, %32 ] |
| %39 = phi i32 [ 0, %25 ], [ %33, %32 ] |
| %40 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %38 |
| %41 = load i8, ptr addrspace(3) %40, align 1, !tbaa !15 |
| %42 = shl i32 %38, 5 |
| %43 = or i32 %42, %26 |
| %44 = add nuw i32 %38, 1 |
| %45 = or i32 %43, %44 |
| %46 = add i32 %38, 5 |
| %47 = icmp ult i32 %46, %20 |
| br i1 %47, label %53, label %48 |
| |
| 48: ; preds = %98, %37 |
| %49 = phi i32 [ %45, %37 ], [ %100, %98 ] |
| %50 = phi i32 [ %39, %37 ], [ %99, %98 ] |
| %51 = phi i32 [ %44, %37 ], [ %54, %98 ] |
| %52 = icmp ult i32 %51, %20 |
| br i1 %52, label %103, label %32 |
| |
| 53: ; preds = %37, %98 |
| %54 = phi i32 [ %101, %98 ], [ %46, %37 ] |
| %55 = phi i32 [ %54, %98 ], [ %44, %37 ] |
| %56 = phi i32 [ %99, %98 ], [ %39, %37 ] |
| %57 = phi i32 [ %100, %98 ], [ %45, %37 ] |
| %58 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %55 |
| %59 = load i8, ptr addrspace(3) %58, align 1, !tbaa !15 |
| %60 = icmp eq i8 %41, %59 |
| br i1 %60, label %61, label %65 |
| |
| 61: ; preds = %53 |
| %62 = add i32 %56, 1 |
| %63 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5 |
| %64 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %63 |
| store i32 %57, ptr addrspace(3) %64, align 4, !tbaa !11 |
| br label %65 |
| |
| 65: ; preds = %61, %53 |
| %66 = phi i32 [ %62, %61 ], [ %56, %53 ] |
| %67 = add i32 %55, 1 |
| %68 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %67 |
| %69 = load i8, ptr addrspace(3) %68, align 1, !tbaa !15 |
| %70 = icmp eq i8 %41, %69 |
| br i1 %70, label %71, label %76 |
| |
| 71: ; preds = %65 |
| %72 = add i32 %57, 1 |
| %73 = add i32 %66, 1 |
| %74 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5 |
| %75 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %74 |
| store i32 %72, ptr addrspace(3) %75, align 4, !tbaa !11 |
| br label %76 |
| |
| 76: ; preds = %71, %65 |
| %77 = phi i32 [ %73, %71 ], [ %66, %65 ] |
| %78 = add i32 %55, 2 |
| %79 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %78 |
| %80 = load i8, ptr addrspace(3) %79, align 1, !tbaa !15 |
| %81 = icmp eq i8 %41, %80 |
| br i1 %81, label %82, label %87 |
| |
| 82: ; preds = %76 |
| %83 = add i32 %57, 2 |
| %84 = add i32 %77, 1 |
| %85 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5 |
| %86 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %85 |
| store i32 %83, ptr addrspace(3) %86, align 4, !tbaa !11 |
| br label %87 |
| |
| 87: ; preds = %82, %76 |
| %88 = phi i32 [ %84, %82 ], [ %77, %76 ] |
| %89 = add i32 %55, 3 |
| %90 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %89 |
| %91 = load i8, ptr addrspace(3) %90, align 1, !tbaa !15 |
| %92 = icmp eq i8 %41, %91 |
| br i1 %92, label %93, label %98 |
| |
| 93: ; preds = %87 |
| %94 = add i32 %57, 3 |
| %95 = add i32 %88, 1 |
| %96 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5 |
| %97 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %96 |
| store i32 %94, ptr addrspace(3) %97, align 4, !tbaa !11 |
| br label %98 |
| |
| 98: ; preds = %93, %87 |
| %99 = phi i32 [ %95, %93 ], [ %88, %87 ] |
| %100 = add i32 %57, 4 |
| %101 = add i32 %54, 4 |
| %102 = icmp ult i32 %101, %20 |
| br i1 %102, label %53, label %48 |
| |
| 103: ; preds = %48, %114 |
| %104 = phi i32 [ %117, %114 ], [ %51, %48 ] |
| %105 = phi i32 [ %115, %114 ], [ %50, %48 ] |
| %106 = phi i32 [ %116, %114 ], [ %49, %48 ] |
| %107 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %104 |
| %108 = load i8, ptr addrspace(3) %107, align 1, !tbaa !15 |
| %109 = icmp eq i8 %41, %108 |
| br i1 %109, label %110, label %114 |
| |
| 110: ; preds = %103 |
| %111 = add i32 %105, 1 |
| %112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5 |
| %113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %112 |
| store i32 %106, ptr addrspace(3) %113, align 4, !tbaa !11 |
| br label %114 |
| |
| 114: ; preds = %110, %103 |
| %115 = phi i32 [ %111, %110 ], [ %105, %103 ] |
| %116 = add i32 %106, 1 |
| %117 = add nuw i32 %104, 1 |
| %118 = icmp ult i32 %117, %20 |
| br i1 %118, label %103, label %32 |
| |
| 119: ; preds = %32, %22, %5 |
| tail call void @_Z7barrierj(i32 noundef 1) #5 |
| %120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11 |
| %121 = icmp ugt i32 %120, %9 |
| br i1 %121, label %122, label %206 |
| |
| 122: ; preds = %119 |
| %123 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8 |
| br label %124 |
| |
| 124: ; preds = %201, %122 |
| %125 = phi i32 [ %9, %122 ], [ %204, %201 ] |
| %126 = phi i64 [ %8, %122 ], [ %203, %201 ] |
| %127 = and i64 %126, 4294967295 |
| %128 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %125 |
| %129 = load i32, ptr addrspace(3) %128, align 4, !tbaa !11 |
| %130 = lshr i32 %129, 10 |
| %131 = lshr i32 %129, 5 |
| %132 = and i32 %131, 31 |
| %133 = and i32 %129, 31 |
| %134 = mul nuw nsw i32 %130, 384 |
| %135 = zext i32 %134 to i64 |
| %136 = getelementptr inbounds i8, ptr addrspace(1) %123, i64 %135 |
| %137 = shl nuw nsw i32 %132, 5 |
| %138 = zext i32 %137 to i64 |
| %139 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %138 |
| %140 = shl nuw nsw i32 %133, 5 |
| %141 = zext i32 %140 to i64 |
| %142 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %141 |
| %143 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 1 |
| %144 = load i64, ptr addrspace(1) %139, align 8, !tbaa !16 |
| %145 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 1 |
| %146 = load i64, ptr addrspace(1) %142, align 8, !tbaa !16 |
| %147 = xor i64 %146, %144 |
| %148 = load i64, ptr addrspace(1) %143, align 8, !tbaa !16 |
| %149 = load i64, ptr addrspace(1) %145, align 8, !tbaa !16 |
| %150 = xor i64 %149, %148 |
| %151 = icmp ne i64 %147, 0 |
| %152 = icmp ne i64 %150, 0 |
| %153 = select i1 %151, i1 true, i1 %152 |
| br i1 %153, label %154, label %201 |
| |
| 154: ; preds = %124 |
| %155 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 2 |
| %156 = load i64, ptr addrspace(1) %155, align 8, !tbaa !16 |
| %157 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 2 |
| %158 = load i64, ptr addrspace(1) %157, align 8, !tbaa !16 |
| %159 = and i64 %147, 983040 |
| %160 = shl i64 %147, 4 |
| %161 = and i64 %160, 61440 |
| %162 = or i64 %161, %159 |
| %163 = lshr i64 %147, 12 |
| %164 = and i64 %163, 3840 |
| %165 = or i64 %162, %164 |
| %166 = and i64 %160, 240 |
| %167 = or i64 %165, %166 |
| %168 = and i64 %163, 15 |
| %169 = or i64 %167, %168 |
| %170 = trunc i64 %169 to i32 |
| %171 = lshr i64 %169, 3 |
| %172 = shl nuw nsw i32 %170, 2 |
| %173 = and i32 %172, 28 |
| %174 = getelementptr inbounds i32, ptr addrspace(1) %3, i64 %171 |
| %175 = shl nuw nsw i32 1, %173 |
| %176 = tail call i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5 |
| %177 = lshr i32 %176, %173 |
| %178 = and i32 %177, 15 |
| %179 = icmp ugt i32 %178, 11 |
| br i1 %179, label %180, label %182 |
| |
| 180: ; preds = %154 |
| %181 = tail call i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5 |
| br label %201 |
| |
| 182: ; preds = %154 |
| %183 = xor i64 %158, %156 |
| %184 = lshr i64 %183, 16 |
| %185 = tail call i64 @llvm.fshl.i64(i64 %183, i64 %150, i64 48) |
| %186 = tail call i64 @llvm.fshl.i64(i64 %150, i64 %147, i64 48) |
| %187 = shl nuw nsw i32 %133, 6 |
| %188 = shl i32 %130, 12 |
| %189 = or i32 %187, %188 |
| %190 = or i32 %189, %132 |
| %191 = mul nuw nsw i64 %169, 384 |
| %192 = and i64 %191, 4294967168 |
| %193 = getelementptr inbounds i8, ptr addrspace(1) %1, i64 %192 |
| %194 = shl nuw nsw i32 %178, 5 |
| %195 = or disjoint i32 %194, 8 |
| %196 = zext i32 %195 to i64 |
| %197 = getelementptr inbounds i8, ptr addrspace(1) %193, i64 %196 |
| %198 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 -4 |
| store i32 %190, ptr addrspace(1) %198, align 4, !tbaa !11 |
| store i64 %186, ptr addrspace(1) %197, align 8, !tbaa !16 |
| %199 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 8 |
| store i64 %185, ptr addrspace(1) %199, align 8, !tbaa !16 |
| %200 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 16 |
| store i64 %184, ptr addrspace(1) %200, align 8, !tbaa !16 |
| br label %201 |
| |
| 201: ; preds = %182, %180, %124 |
| %202 = tail call i64 @_Z14get_local_sizej(i32 noundef 0) #4 |
| %203 = add i64 %202, %127 |
| %204 = trunc i64 %203 to i32 |
| %205 = icmp ugt i32 %120, %204 |
| br i1 %205, label %124, label %206 |
| |
| 206: ; preds = %201, %119 |
| ret void |
| } |
| |
| ; NOTE: @kernel_round1_short below is a hand-reduced copy of @kernel_round1 with
| ; most of the if-else blocks removed; some leftover values and "preds" comments
| ; still refer to blocks that were deleted during the reduction.
| |
| ; Reduced variant of @kernel_round1: keeps the entry block, the nested
| ; byte-comparison loops over first_words_data, and the final barrier; the
| ; large store/if-else section was deleted. As a result several values
| ; (%.21, %.24, %.60, %.120, %.121) are computed but never used, and a few
| ; "preds" comments referenced now-deleted blocks (corrected below).
| ; Arguments (per !9): %.0=ht_src, %.1=ht_dst, %.2=rowCountersSrc,
| ; %.3=rowCountersDst, %.4=debug.
| define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapture noundef readonly align 1 %.0, ptr addrspace(1) nocapture noundef writeonly align 1 %.1, ptr addrspace(1) nocapture noundef readonly align 4 %.2, ptr addrspace(1) noundef align 4 %.3, ptr addrspace(1) nocapture noundef readnone align 4 %.4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
| ; CHECK-LABEL: kernel_round1_short:
| ; CHECK: ; %bb.0: ; %.5
| ; CHECK-NEXT: s_add_u32 s12, s12, s17
| ; CHECK-NEXT: s_mov_b32 s32, 0
| ; CHECK-NEXT: s_addc_u32 s13, s13, 0
| ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
| ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
| ; CHECK-NEXT: s_load_dwordx2 s[54:55], s[8:9], 0x10
| ; CHECK-NEXT: s_add_u32 s0, s0, s17
| ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9]
| ; CHECK-NEXT: s_addc_u32 s1, s1, 0
| ; CHECK-NEXT: v_mov_b32_e32 v40, v0
| ; CHECK-NEXT: s_add_u32 s52, s38, 40
| ; CHECK-NEXT: v_mov_b32_e32 v31, v0
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0
| ; CHECK-NEXT: s_mov_b32 s33, s16
| ; CHECK-NEXT: s_addc_u32 s53, s39, 0
| ; CHECK-NEXT: s_mov_b32 s51, s14
| ; CHECK-NEXT: s_getpc_b64 s[16:17]
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53]
| ; CHECK-NEXT: s_mov_b32 s12, s14
| ; CHECK-NEXT: s_mov_b32 s13, s15
| ; CHECK-NEXT: s_mov_b32 s14, s33
| ; CHECK-NEXT: s_mov_b32 s50, s15
| ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
| ; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7]
| ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
| ; CHECK-NEXT: v_mov_b32_e32 v43, 0
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
| ; CHECK-NEXT: v_mov_b32_e32 v42, v0
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0
| ; CHECK-NEXT: s_getpc_b64 s[16:17]
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53]
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
| ; CHECK-NEXT: s_mov_b32 s12, s51
| ; CHECK-NEXT: s_mov_b32 s13, s50
| ; CHECK-NEXT: s_mov_b32 s14, s33
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
| ; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40
| ; CHECK-NEXT: v_mov_b32_e32 v0, 1
| ; CHECK-NEXT: s_getpc_b64 s[16:17]
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53]
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
| ; CHECK-NEXT: s_mov_b32 s12, s51
| ; CHECK-NEXT: s_mov_b32 s13, s50
| ; CHECK-NEXT: s_mov_b32 s14, s33
| ; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360
| ; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
| ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42
| ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40
| ; CHECK-NEXT: s_getpc_b64 s[16:17]
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
| ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0
| ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53]
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
| ; CHECK-NEXT: global_load_dword v0, v0, s[54:55]
| ; CHECK-NEXT: s_mov_b32 s12, s51
| ; CHECK-NEXT: s_mov_b32 s13, s50
| ; CHECK-NEXT: s_mov_b32 s14, s33
| ; CHECK-NEXT: s_waitcnt vmcnt(0)
| ; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4
| ; CHECK-NEXT: v_mov_b32_e32 v1, 12
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
| ; CHECK-NEXT: v_mov_b32_e32 v41, v0
| ; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42
| ; CHECK-NEXT: s_mov_b32 s52, 0
| ; CHECK-NEXT: s_mov_b32 s4, 0
| ; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364
| ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41
| ; CHECK-NEXT: .LBB1_1: ; %.37
| ; CHECK-NEXT: ; =>This Loop Header: Depth=1
| ; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
| ; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
| ; CHECK-NEXT: s_lshl_b32 s5, s4, 5
| ; CHECK-NEXT: s_add_i32 s53, s4, 1
| ; CHECK-NEXT: s_add_i32 s6, s4, 5
| ; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
| ; CHECK-NEXT: ds_read_u8 v46, v0
| ; CHECK-NEXT: v_mov_b32_e32 v56, s53
| ; CHECK-NEXT: s_mov_b32 s5, exec_lo
| ; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
| ; CHECK-NEXT: s_cbranch_execz .LBB1_5
| ; CHECK-NEXT: ; %bb.2: ; %.53.preheader
| ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
| ; CHECK-NEXT: s_mov_b32 s6, 0
| ; CHECK-NEXT: s_mov_b32 s7, 0
| ; CHECK-NEXT: .LBB1_3: ; %.53
| ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
| ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
| ; CHECK-NEXT: s_add_i32 s7, s7, 4
| ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
| ; CHECK-NEXT: s_add_i32 s8, s4, s7
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
| ; CHECK-NEXT: s_add_i32 s9, s8, 5
| ; CHECK-NEXT: s_add_i32 s8, s8, 1
| ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
| ; CHECK-NEXT: v_mov_b32_e32 v56, s8
| ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
| ; CHECK-NEXT: s_cbranch_execnz .LBB1_3
| ; CHECK-NEXT: ; %bb.4: ; %Flow3
| ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
| ; CHECK-NEXT: v_mov_b32_e32 v47, v0
| ; CHECK-NEXT: .LBB1_5: ; %Flow4
| ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
| ; CHECK-NEXT: s_mov_b32 s54, exec_lo
| ; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
| ; CHECK-NEXT: s_cbranch_execz .LBB1_11
| ; CHECK-NEXT: ; %bb.6: ; %.103.preheader
| ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
| ; CHECK-NEXT: s_mov_b32 s55, 0
| ; CHECK-NEXT: s_inst_prefetch 0x1
| ; CHECK-NEXT: s_branch .LBB1_8
| ; CHECK-NEXT: .p2align 6
| ; CHECK-NEXT: .LBB1_7: ; %.114
| ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64
| ; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
| ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
| ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
| ; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55
| ; CHECK-NEXT: s_cbranch_execz .LBB1_10
| ; CHECK-NEXT: .LBB1_8: ; %.103
| ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
| ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
| ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
| ; CHECK-NEXT: ds_read_u8 v0, v0
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
| ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
| ; CHECK-NEXT: s_and_saveexec_b32 s64, s4
| ; CHECK-NEXT: s_cbranch_execz .LBB1_7
| ; CHECK-NEXT: ; %bb.9: ; %.110
| ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
| ; CHECK-NEXT: s_add_u32 s8, s38, 40
| ; CHECK-NEXT: s_addc_u32 s9, s39, 0
| ; CHECK-NEXT: s_getpc_b64 s[16:17]
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
| ; CHECK-NEXT: s_mov_b32 s12, s51
| ; CHECK-NEXT: s_mov_b32 s13, s50
| ; CHECK-NEXT: s_mov_b32 s14, s33
| ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
| ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
| ; CHECK-NEXT: ds_write_b32 v0, v47
| ; CHECK-NEXT: s_branch .LBB1_7
| ; CHECK-NEXT: .LBB1_10: ; %Flow
| ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
| ; CHECK-NEXT: s_inst_prefetch 0x2
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
| ; CHECK-NEXT: .LBB1_11: ; %Flow2
| ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s54
| ; CHECK-NEXT: ; %bb.12: ; %.32
| ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
| ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v45
| ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43
| ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
| ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
| ; CHECK-NEXT: s_or_b32 s52, s4, s52
| ; CHECK-NEXT: s_mov_b32 s4, s53
| ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52
| ; CHECK-NEXT: s_cbranch_execnz .LBB1_1
| ; CHECK-NEXT: ; %bb.13: ; %.119
| ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
| ; CHECK-NEXT: v_mov_b32_e32 v31, v40
| ; CHECK-NEXT: v_mov_b32_e32 v0, 1
| ; CHECK-NEXT: s_add_u32 s8, s38, 40
| ; CHECK-NEXT: s_addc_u32 s9, s39, 0
| ; CHECK-NEXT: s_getpc_b64 s[16:17]
| ; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4
| ; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37]
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
| ; CHECK-NEXT: s_mov_b32 s12, s51
| ; CHECK-NEXT: s_mov_b32 s13, s50
| ; CHECK-NEXT: s_mov_b32 s14, s33
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
| ; CHECK-NEXT: s_endpgm
| ; Entry: get global/local ids, point %.11 at this work-item's 14-byte slice
| ; of first_words_data, zero the shared collision counter, barrier, then read
| ; this lane's 4-bit count from rowCountersSrc (%.2 per !9) and clamp it to
| ; 12 via min(). %.21 and %.24 are unused — leftovers from the reduction.
| .5:
| %.6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
| %.7 = trunc i64 %.6 to i32
| %.8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
| %.9 = trunc i64 %.8 to i32
| %.10 = mul i32 %.9, 14
| %.11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %.10
| store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
| tail call void @_Z7barrierj(i32 noundef 1) #5
| %.12 = lshr i64 %.6, 3
| %.13 = shl i32 %.7, 2
| %.14 = and i32 %.13, 28
| %.15 = and i64 %.12, 536870911
| %.16 = getelementptr inbounds i32, ptr addrspace(1) %.2, i64 %.15
| %.17 = load i32, ptr addrspace(1) %.16, align 4, !tbaa !11
| %.18 = lshr i32 %.17, %.14
| %.19 = and i32 %.18, 15
| %.20 = tail call i32 @_Z3minjj(i32 noundef %.19, i32 noundef 12) #4
| %.21 = icmp eq i32 %.20, 0
| %.23 = add i32 %.20, -1
| %.24 = icmp eq i32 %.23, 0
| store i8 0, ptr addrspace(3) %.11, align 1, !tbaa !15
| br label %.37
|
| ; Outer-loop latch: continue while the next index %.44 < %.23 and the trip
| ; counter %.33 < 60.
| .32: ; preds = %.114, %.48
| %.33 = phi i32 [ %.50, %.48 ], [ %.115, %.114 ]
| %.34 = icmp ult i32 %.44, %.23
| %.35 = icmp ult i32 %.33, 60
| %.36 = select i1 %.34, i1 %.35, i1 false
| br i1 %.36, label %.37, label %.119
|
| ; Outer loop header: load byte %.38 of the slice and pack the candidate
| ; index %.45 = (gid << 10) | (i << 5) | (i + 1). Takes the unrolled scan
| ; (.53) when at least 4 more elements remain, else the remainder path (.48).
| .37: ; preds = %.32, %.5
| %.38 = phi i32 [ 0, %.5 ], [ %.44, %.32 ]
| %.39 = phi i32 [ 0, %.5 ], [ %.33, %.32 ]
| %.26 = shl i32 %.7, 10
| %.40 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.38
| %.41 = load i8, ptr addrspace(3) %.40, align 1, !tbaa !15
| %.42 = shl i32 %.38, 5
| %.43 = or i32 %.42, %.26
| %.44 = add nuw i32 %.38, 1
| %.45 = or i32 %.43, %.44
| %.46 = add i32 %.38, 5
| %.47 = icmp ult i32 %.46, %.20
| br i1 %.47, label %.53, label %.48
|
| ; After the unrolled scan (or when it was skipped): fall into the remainder
| ; loop .103 if elements remain, otherwise back to the outer latch.
| .48: ; preds = %.98, %.37
| %.49 = phi i32 [ %.45, %.37 ], [ %.100, %.98 ]
| %.50 = phi i32 [ %.39, %.37 ], [ %.99, %.98 ]
| %.51 = phi i32 [ %.44, %.37 ], [ %.54, %.98 ]
| %.52 = icmp ult i32 %.51, %.20
| br i1 %.52, label %.103, label %.32
|
| ; Unrolled (step-4) inner scan. %.60's compare result is unused here — the
| ; blocks that consumed it were removed in the reduction.
| .53: ; preds = %.37, %.98
| %.54 = phi i32 [ %.101, %.98 ], [ %.46, %.37 ]
| %.55 = phi i32 [ %.54, %.98 ], [ %.44, %.37 ]
| %.56 = phi i32 [ %.99, %.98 ], [ %.39, %.37 ]
| %.57 = phi i32 [ %.100, %.98 ], [ %.45, %.37 ]
| %.58 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.55
| %.59 = load i8, ptr addrspace(3) %.58, align 1, !tbaa !15
| %.60 = icmp eq i8 %.41, %.59
| br label %.98
|
| ; Latch of the unrolled scan: advance by 4 and re-test.
| .98: ; preds = %.53
| %.99 = add i32 %.56, 1
| %.100 = add i32 %.57, 4
| %.101 = add i32 %.54, 4
| %.102 = icmp ult i32 %.101, %.20
| br i1 %.102, label %.53, label %.48
|
| ; Remainder inner loop: compare byte %.41 against byte %.104 of the slice.
| .103: ; preds = %.48, %.114
| %.104 = phi i32 [ %.117, %.114 ], [ %.51, %.48 ]
| %.105 = phi i32 [ %.115, %.114 ], [ %.50, %.48 ]
| %.106 = phi i32 [ %.116, %.114 ], [ %.49, %.48 ]
| %.107 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.104
| %.108 = load i8, ptr addrspace(3) %.107, align 1, !tbaa !15
| %.109 = icmp eq i8 %.41, %.108
| br i1 %.109, label %.110, label %.114
|
| ; Match found: bump the shared collision counter and record the packed
| ; index %.106 into collisionsData.
| .110: ; preds = %.103
| %.111 = add i32 %.105, 1
| %.112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
| %.113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %.112
| store i32 %.106, ptr addrspace(3) %.113, align 4, !tbaa !11
| br label %.114
|
| ; Remainder-loop latch.
| .114: ; preds = %.110, %.103
| %.115 = phi i32 [ %.111, %.110 ], [ %.105, %.103 ]
| %.116 = add i32 %.106, 1
| %.117 = add nuw i32 %.104, 1
| %.118 = icmp ult i32 %.117, %.20
| br i1 %.118, label %.103, label %.32
|
| ; Exit: final barrier. %.120/%.121 are computed but unused — leftovers from
| ; the deleted tail of @kernel_round1.
| .119: ; preds = %.32
| tail call void @_Z7barrierj(i32 noundef 1) #5
| %.120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
| %.121 = icmp ugt i32 %.120, %.9
| br label %.206
|
| .206: ; preds = %.119
| ret void
| }
| |
| ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
| ; llvm.fshl = funnel-shift-left intrinsic (see LLVM LangRef); @kernel_round1
| ; uses it to funnel-shift pairs of XOR'd 64-bit words left by 48 bits.
| declare i64 @llvm.fshl.i64(i64, i64, i64) #3
| |
| attributes #0 = { convergent mustprogress nofree nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } |
| attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } |
| attributes #2 = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="64,64" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" "uniform-work-group-size"="true" } |
| attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } |
| attributes #4 = { convergent nounwind willreturn memory(none) } |
| attributes #5 = { convergent nounwind } |
| |
| !llvm.module.flags = !{!0, !1, !2} |
| !opencl.ocl.version = !{!3} |
| !llvm.ident = !{!4} |
| |
| !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} |
| !1 = !{i32 1, !"wchar_size", i32 4} |
| !2 = !{i32 8, !"PIC Level", i32 2} |
| !3 = !{i32 1, i32 2} |
| !4 = !{!"clang version 17.0.0 (ssh://chfang@git.amd.com:29418/lightning/ec/llvm-project 06ead8cf696777b9f17876b60707ba9de4d0606f)"} |
| !5 = !{i32 1, i32 1, i32 1, i32 1, i32 1} |
| !6 = !{!"none", !"none", !"none", !"none", !"none"} |
| !7 = !{!"char*", !"char*", !"uint*", !"uint*", !"uint*"} |
| !8 = !{!"", !"", !"", !"", !""} |
| !9 = !{!"ht_src", !"ht_dst", !"rowCountersSrc", !"rowCountersDst", !"debug"} |
| !10 = !{i32 64, i32 1, i32 1} |
| !11 = !{!12, !12, i64 0} |
| !12 = !{!"int", !13, i64 0} |
| !13 = !{!"omnipotent char", !14, i64 0} |
| !14 = !{!"Simple C/C++ TBAA"} |
| !15 = !{!13, !13, i64 0} |
| !16 = !{!17, !17, i64 0} |
| !17 = !{!"long", !13, i64 0} |