|  | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 
|  | ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GCN,GFX9 %s | 
|  | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GCN,GFX10 %s | 
|  |  | 
|  | declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone | 
|  | declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone | 
|  |  | 
|  | ; Count leading zeros of an i64 that is loaded with only 1-byte alignment, with | 
|  | ; the intrinsic's second argument set to true (is_zero_poison in the LangRef). | 
|  | ; Expected code on both targets: eight global_load_ubyte results are shifted and | 
|  | ; OR'd into two dwords, v_ffbh_u32 runs on each, 32 is added with clamp to the | 
|  | ; count for the dword built from offsets 0-3, and v_min_u32 merges the two. | 
|  | ; No further v_min_u32 against 64 is expected here (compare @ctlz_i64). | 
|  | ; The CHECK lines are autogenerated (see the NOTE at the top of the file); | 
|  | ; regenerate them rather than editing them by hand. | 
|  | define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { | 
|  | ; GFX9-LABEL: ctlz_i64_poison: | 
|  | ; GFX9:       ; %bb.0: | 
|  | ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v1, 0 | 
|  | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5 | 
|  | ; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6 | 
|  | ; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7 | 
|  | ; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1 | 
|  | ; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3 | 
|  | ; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4 | 
|  | ; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3] | 
|  | ; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v6 | 
|  | ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v3, v4, v7 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX9-NEXT:    v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0 | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v2, v4, v3 | 
|  | ; GFX9-NEXT:    v_ffbh_u32_e32 v2, v2 | 
|  | ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v0 | 
|  | ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp | 
|  | ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0 | 
|  | ; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1] | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: ctlz_i64_poison: | 
|  | ; GFX10:       ; %bb.0: | 
|  | ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v1, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_clause 0x7 | 
|  | ; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5 | 
|  | ; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6 | 
|  | ; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7 | 
|  | ; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1 | 
|  | ; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3 | 
|  | ; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] | 
|  | ; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2 | 
|  | ; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 8, v5 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v8 | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6 | 
|  | ; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v3, v5, v4 | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0 | 
|  | ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v3 | 
|  | ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0 | 
|  | ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp | 
|  | ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0 | 
|  | ; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1] | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | %val = load i64, ptr addrspace(1) %arrayidx, align 1 | 
|  | %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone | 
|  | store i64 %ctlz, ptr addrspace(1) %out, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Count leading zeros of an i64 that is loaded with only 1-byte alignment, with | 
|  | ; the intrinsic's second argument set to false (is_zero_poison in the LangRef). | 
|  | ; Expected code on both targets: eight global_load_ubyte results are shifted and | 
|  | ; OR'd into two dwords, v_ffbh_u32 runs on each, 32 is added with clamp to the | 
|  | ; count for the dword built from offsets 0-3, and v_min_u32 merges the two. | 
|  | ; Unlike @ctlz_i64_poison, one more v_min_u32 against the constant 64 is | 
|  | ; expected before the store. | 
|  | ; The CHECK lines are autogenerated (see the NOTE at the top of the file); | 
|  | ; regenerate them rather than editing them by hand. | 
|  | define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { | 
|  | ; GFX9-LABEL: ctlz_i64: | 
|  | ; GFX9:       ; %bb.0: | 
|  | ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v1, 0 | 
|  | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5 | 
|  | ; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6 | 
|  | ; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7 | 
|  | ; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1 | 
|  | ; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3 | 
|  | ; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4 | 
|  | ; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3] | 
|  | ; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v6 | 
|  | ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v3, v4, v7 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX9-NEXT:    v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0 | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v2, v4, v3 | 
|  | ; GFX9-NEXT:    v_ffbh_u32_e32 v2, v2 | 
|  | ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v0 | 
|  | ; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp | 
|  | ; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0 | 
|  | ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0 | 
|  | ; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1] | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: ctlz_i64: | 
|  | ; GFX10:       ; %bb.0: | 
|  | ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v1, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_clause 0x7 | 
|  | ; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5 | 
|  | ; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6 | 
|  | ; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7 | 
|  | ; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1 | 
|  | ; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3 | 
|  | ; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] | 
|  | ; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2 | 
|  | ; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 8, v5 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v8 | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6 | 
|  | ; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v3, v5, v4 | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0 | 
|  | ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v3 | 
|  | ; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0 | 
|  | ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp | 
|  | ; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0 | 
|  | ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0 | 
|  | ; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1] | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | %val = load i64, ptr addrspace(1) %arrayidx, align 1 | 
|  | %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone | 
|  | store i64 %ctlz, ptr addrspace(1) %out, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Count trailing zeros of an i64 that is loaded with only 1-byte alignment, with | 
|  | ; the intrinsic's second argument set to true (is_zero_poison in the LangRef). | 
|  | ; Expected code on both targets: eight global_load_ubyte results are shifted and | 
|  | ; OR'd into two dwords, v_ffbl_b32 runs on each, 32 is added with clamp to the | 
|  | ; count for the dword built from offsets 4-7, and v_min_u32 merges the two. | 
|  | ; No further v_min_u32 against 64 is expected here (compare @cttz_i64). | 
|  | ; The CHECK lines are autogenerated (see the NOTE at the top of the file); | 
|  | ; regenerate them rather than editing them by hand. | 
|  | define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { | 
|  | ; GFX9-LABEL: cttz_i64_poison: | 
|  | ; GFX9:       ; %bb.0: | 
|  | ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v1, 0 | 
|  | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5 | 
|  | ; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6 | 
|  | ; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7 | 
|  | ; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1 | 
|  | ; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3 | 
|  | ; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4 | 
|  | ; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3] | 
|  | ; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v6 | 
|  | ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v3, v4, v7 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX9-NEXT:    v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0 | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v2, v4, v3 | 
|  | ; GFX9-NEXT:    v_ffbl_b32_e32 v0, v0 | 
|  | ; GFX9-NEXT:    v_ffbl_b32_e32 v2, v2 | 
|  | ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp | 
|  | ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2 | 
|  | ; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1] | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: cttz_i64_poison: | 
|  | ; GFX10:       ; %bb.0: | 
|  | ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v1, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_clause 0x7 | 
|  | ; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5 | 
|  | ; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7 | 
|  | ; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6 | 
|  | ; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1 | 
|  | ; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3 | 
|  | ; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4 | 
|  | ; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] | 
|  | ; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 8, v5 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v6 | 
|  | ; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v3, v4, v7 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0 | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v2, v4, v3 | 
|  | ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0 | 
|  | ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2 | 
|  | ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp | 
|  | ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2 | 
|  | ; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1] | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | %val = load i64, ptr addrspace(1) %arrayidx, align 1 | 
|  | %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone | 
|  | store i64 %cttz, ptr addrspace(1) %out, align 8 | 
|  | ret void | 
|  | } | 
|  |  | 
|  | ; Count trailing zeros of an i64 that is loaded with only 1-byte alignment, with | 
|  | ; the intrinsic's second argument set to false (is_zero_poison in the LangRef). | 
|  | ; Expected code on both targets: eight global_load_ubyte results are shifted and | 
|  | ; OR'd into two dwords, v_ffbl_b32 runs on each, 32 is added with clamp to the | 
|  | ; count for the dword built from offsets 4-7, and v_min_u32 merges the two. | 
|  | ; Unlike @cttz_i64_poison, one more v_min_u32 against the constant 64 is | 
|  | ; expected before the store. | 
|  | ; The CHECK lines are autogenerated (see the NOTE at the top of the file); | 
|  | ; regenerate them rather than editing them by hand. | 
|  | define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { | 
|  | ; GFX9-LABEL: cttz_i64: | 
|  | ; GFX9:       ; %bb.0: | 
|  | ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24 | 
|  | ; GFX9-NEXT:    v_mov_b32_e32 v1, 0 | 
|  | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5 | 
|  | ; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6 | 
|  | ; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7 | 
|  | ; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1 | 
|  | ; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3 | 
|  | ; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4 | 
|  | ; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3] | 
|  | ; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(5) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v6 | 
|  | ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v3, v4, v7 | 
|  | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX9-NEXT:    v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0 | 
|  | ; GFX9-NEXT:    v_or_b32_e32 v2, v4, v3 | 
|  | ; GFX9-NEXT:    v_ffbl_b32_e32 v0, v0 | 
|  | ; GFX9-NEXT:    v_ffbl_b32_e32 v2, v2 | 
|  | ; GFX9-NEXT:    v_add_u32_e64 v0, v0, 32 clamp | 
|  | ; GFX9-NEXT:    v_min_u32_e32 v0, v0, v2 | 
|  | ; GFX9-NEXT:    v_min_u32_e32 v0, 64, v0 | 
|  | ; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1] | 
|  | ; GFX9-NEXT:    s_endpgm | 
|  | ; | 
|  | ; GFX10-LABEL: cttz_i64: | 
|  | ; GFX10:       ; %bb.0: | 
|  | ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24 | 
|  | ; GFX10-NEXT:    v_mov_b32_e32 v1, 0 | 
|  | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | ; GFX10-NEXT:    s_clause 0x7 | 
|  | ; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5 | 
|  | ; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:7 | 
|  | ; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:6 | 
|  | ; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1 | 
|  | ; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3 | 
|  | ; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4 | 
|  | ; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] | 
|  | ; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(7) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(6) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(4) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(3) | 
|  | ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 8, v5 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(2) | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v6 | 
|  | ; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v3, v4, v7 | 
|  | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
|  | ; GFX10-NEXT:    v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0 | 
|  | ; GFX10-NEXT:    v_or_b32_e32 v2, v4, v3 | 
|  | ; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0 | 
|  | ; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2 | 
|  | ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp | 
|  | ; GFX10-NEXT:    v_min_u32_e32 v0, v0, v2 | 
|  | ; GFX10-NEXT:    v_min_u32_e32 v0, 64, v0 | 
|  | ; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1] | 
|  | ; GFX10-NEXT:    s_endpgm | 
|  | %val = load i64, ptr addrspace(1) %arrayidx, align 1 | 
|  | %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone | 
|  | store i64 %cttz, ptr addrspace(1) %out, align 8 | 
|  | ret void | 
|  | } | 
|  | ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: | 
|  | ; GCN: {{.*}} |