| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 
 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10 | 
 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx908  < %s | FileCheck %s -check-prefixes=GFX9 | 
 |  | 
 ; Byte shuffle <6,7,6,6>: every index is >= 4, so only %vec1 contributes
 ; (its bytes 2, 3, 2, 2). The checks expect a single global_load_dword and one
 ; v_perm_b32 that uses the loaded register for both sources with selector
 ; 0x6060706; for gfx908 the selector is first moved into s4.
 | define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle6766: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x6060706 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle6766: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x6060706 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 6, i32 6> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Byte shuffle <3,7,4,4>: byte 3 of %vec0 followed by bytes 3, 0, 0 of %vec1.
 ; Both dwords are loaded and combined by one v_perm_b32 with selector 0x307;
 ; the gfx908 checks materialize that selector with s_movk_i32.
 | define hidden void @shuffle3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle3744: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x307 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle3744: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_movk_i32 s4, 0x307 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 7, i32 4, i32 4> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Byte shuffle <4,4,4,5>: only %vec1 is read (byte 0 three times, then byte 1).
 ; The checks expect a single global_load_dword and a v_perm_b32 with the
 ; loaded register as both sources and selector 0x5040404.
 | define hidden void @shuffle4445(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle4445: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040404 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle4445: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x5040404 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 4, i32 4, i32 5> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Byte shuffle <0,1,0,1>: only %vec0 is read (its low two bytes, repeated).
 ; The checks expect a single global_load_dword from v[0:1] and a v_perm_b32
 ; with the loaded register as both sources and selector 0x5040504.
 | define hidden void @shuffle0101(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle0101: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle0101: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x5040504 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Byte shuffle <1,0,0,4>: bytes 1, 0, 0 of %vec0 followed by byte 0 of %vec1.
 ; Both dwords are loaded and combined by one v_perm_b32 with selector 0x40405.
 | define hidden void @shuffle1004(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle1004: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x40405 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle1004: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x40405 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 0, i32 0, i32 4> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 |  | 
 ; Two-input byte shuffle through addrspace(0) pointers: the checks expect
 ; flat_load_dword / flat_store_dword and waits that include lgkmcnt.
 ; Mask <7,5,3,3>: bytes 3 and 1 of %vec1, then byte 3 of %vec0 twice;
 ; expected v_perm_b32 selector is 0x3030507.
 | define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) { | 
 | ; GFX10-LABEL: shuffle7533: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    flat_load_dword v6, v[0:1] | 
 | ; GFX10-NEXT:    flat_load_dword v7, v[2:3] | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3030507 | 
 | ; GFX10-NEXT:    flat_store_dword v[4:5], v0 | 
 | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle7533: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    flat_load_dword v6, v[0:1] | 
 | ; GFX9-NEXT:    flat_load_dword v7, v[2:3] | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x3030507 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    flat_store_dword v[4:5], v0 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; addrspace(0) variant where mask <7,7,6,7> reads only %vec1 (bytes 3, 3, 2, 3).
 ; The checks expect a single flat_load_dword and a v_perm_b32 with the loaded
 ; register as both sources and selector 0x7060707.
 | define hidden void @shuffle7767(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) { | 
 | ; GFX10-LABEL: shuffle7767: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    flat_load_dword v0, v[2:3] | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060707 | 
 | ; GFX10-NEXT:    flat_store_dword v[4:5], v0 | 
 | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle7767: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    flat_load_dword v0, v[2:3] | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x7060707 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    flat_store_dword v[4:5], v0 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Two-input byte shuffle through addrspace(3) pointers: the checks expect
 ; ds_read_b32 / ds_write_b32 with lgkmcnt waits.
 ; Mask <0,5,5,4>: byte 0 of %vec0, then bytes 1, 1, 0 of %vec1;
 ; expected v_perm_b32 selector is 0x10104.
 | define hidden void @shuffle0554(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) { | 
 | ; GFX10-LABEL: shuffle0554: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    ds_read_b32 v0, v0 | 
 | ; GFX10-NEXT:    ds_read_b32 v1, v1 | 
 | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x10104 | 
 | ; GFX10-NEXT:    ds_write_b32 v2, v0 | 
 | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle0554: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    ds_read_b32 v0, v0 | 
 | ; GFX9-NEXT:    ds_read_b32 v1, v1 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x10104 | 
 | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4 | 
 | ; GFX9-NEXT:    ds_write_b32 v2, v0 | 
 | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 4> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; addrspace(3) variant with mask <2,1,2,7>: bytes 2, 1, 2 of %vec0 followed by
 ; byte 3 of %vec1. The checks expect two ds_read_b32 feeding one v_perm_b32
 ; with selector 0x3060506, then a ds_write_b32.
 | define hidden void @shuffle2127(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) { | 
 | ; GFX10-LABEL: shuffle2127: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    ds_read_b32 v0, v0 | 
 | ; GFX10-NEXT:    ds_read_b32 v1, v1 | 
 | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3060506 | 
 | ; GFX10-NEXT:    ds_write_b32 v2, v0 | 
 | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle2127: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    ds_read_b32 v0, v0 | 
 | ; GFX9-NEXT:    ds_read_b32 v1, v1 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x3060506 | 
 | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4 | 
 | ; GFX9-NEXT:    ds_write_b32 v2, v0 | 
 | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 1, i32 2, i32 7> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Two-input byte shuffle through addrspace(5) pointers: the checks expect
 ; buffer_load_dword / buffer_store_dword with "offen" addressing via s[0:3]
 ; (the gfx1010 checks also group the two loads under s_clause 0x1).
 ; Mask <5,0,4,7>: byte 1 of %vec1, byte 0 of %vec0, bytes 0 and 3 of %vec1;
 ; expected v_perm_b32 selector is 0x7040005.
 | define hidden void @shuffle5047(ptr addrspace(5) %in0, ptr addrspace(5) %in1, ptr addrspace(5) %out0) { | 
 | ; GFX10-LABEL: shuffle5047: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    s_clause 0x1 | 
 | ; GFX10-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen | 
 | ; GFX10-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v4, v3, 0x7040005 | 
 | ; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle5047: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen | 
 | ; GFX9-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x7040005 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v4, v3, s4 | 
 | ; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(5) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(5) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 0, i32 4, i32 7> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(5) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Byte shuffle <3,5,4,6>: byte 3 of %vec0 followed by bytes 1, 0, 2 of %vec1.
 ; Both dwords are loaded and combined by one v_perm_b32 with selector
 ; 0x2000107.
 | define hidden void @shuffle3546(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle3546: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x2000107 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle3546: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x2000107 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 5, i32 4, i32 6> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 ; Single-input shuffle: the second shufflevector operand is poison.
 ; Mask <7,3,3,0>: lane 0 indexes the poison operand, lanes 1-3 are bytes
 ; 3, 3, 0 of %vec0. The checks expect one load and a v_perm_b32 with the
 ; loaded register as both sources and selector 0x4070706.
 | define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle7330ud2: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x4070706 | 
 | ; GFX10-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle7330ud2: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x4070706 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 7, i32 3, i32 3, i32 0> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Single-input shuffle against poison with mask <5,3,4,1>: lanes 0 and 2 index
 ; the poison operand, lanes 1 and 3 are bytes 3 and 1 of %vec0.
 ; Unlike the neighbouring tests, the checks here expect a v_alignbit_b32 by 16
 ; instead of a v_perm_b32, so no selector constant is materialized.
 | define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle5341ud2: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16 | 
 | ; GFX10-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle5341ud2: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_alignbit_b32 v0, v0, v0, 16 | 
 | ; GFX9-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 5, i32 3, i32 4, i32 1> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Single-input shuffle against poison with mask <6,1,0,6>: lanes 0 and 3 index
 ; the poison operand, lanes 1 and 2 are bytes 1 and 0 of %vec0.
 ; The checks expect a v_perm_b32 with selector 0x5040504.
 | define hidden void @shuffle6106ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle6106ud2: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504 | 
 | ; GFX10-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle6106ud2: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x5040504 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 6, i32 1, i32 0, i32 6> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 ; Single-input shuffle against poison with mask <4,3,2,7>: lanes 0 and 3 index
 ; the poison operand, lanes 1 and 2 are bytes 3 and 2 of %vec0.
 ; The checks expect a v_perm_b32 with selector 0x7060706.
 | define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle4327ud2: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060706 | 
 | ; GFX10-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle4327ud2: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x7060706 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 4, i32 3, i32 2, i32 7> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Single-input shuffle against poison with mask <3,2,6,3>: lane 2 indexes the
 ; poison operand, lanes 0, 1 and 3 are bytes 3, 2 and 3 of %vec0.
 ; The checks expect a v_perm_b32 with selector 0x7060607.
 | define hidden void @shuffle3263ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle3263ud2: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060607 | 
 | ; GFX10-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle3263ud2: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x7060607 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 6, i32 3> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Single-input shuffle against poison with mask <2,7,6,3>: lanes 1 and 2 index
 ; the poison operand, lanes 0 and 3 are bytes 2 and 3 of %vec0.
 ; The checks expect a v_perm_b32 with selector 0x7060706.
 | define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle2763ud2: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060706 | 
 | ; GFX10-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle2763ud2: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x7060706 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 2, i32 7, i32 6, i32 3> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Single-input shuffle against poison with mask <1,3,2,7>: lane 3 indexes the
 ; poison operand, lanes 0-2 are bytes 1, 3 and 2 of %vec0.
 ; The checks expect a v_perm_b32 with selector 0x7060705.
 | define hidden void @shuffle1327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle1327ud2: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060705 | 
 | ; GFX10-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle1327ud2: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x7060705 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 1, i32 3, i32 2, i32 7> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Single-input shuffle against poison with mask <0,6,0,5>: lanes 1 and 3 index
 ; the poison operand, lanes 0 and 2 are both byte 0 of %vec0.
 ; The checks expect a v_perm_b32 with selector 0x5040504.
 | define hidden void @shuffle0605ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: shuffle0605ud2: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504 | 
 | ; GFX10-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shuffle0605ud2: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x5040504 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 0, i32 6, i32 0, i32 5> | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 ; Shuffle against poison (mask <1,3,5,4>) followed by an insertelement of %elt
 ; into lane 1, so only lane 0 (byte 1 of %vec0) survives from the shuffle.
 ; The checks expect no v_perm_b32: %elt is shifted left by 8, OR-ed with
 ; byte 1 of the loaded dword via an SDWA v_or_b32, and masked with 0xffff.
 ; NOTE(review): %vec1 is loaded in the IR but never used; the checks contain
 ; only one global_load_dword.
 | define hidden void @insertUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: insertUsesOr: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: insertUsesOr: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v4 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0 | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 4> | 
 |   %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   ret void | 
 | } | 
 |  | 
 ; Shuffle against poison (mask <7,0,6,3>) whose result feeds a <4 x i8> add
 ; with %vec1. Lanes 0 and 2 index the poison operand; lanes 1 and 3 are bytes
 ; 0 and 3 of %vec0.
 ; The checks expect no v_perm_b32: the bytes are added individually and
 ; recombined with SDWA v_or_b32 (shifts plus v_add_nc_u16 in the gfx1010
 ; output, v_add_u16_sdwa in the gfx908 output).
 | define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: addUsesOr: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 24, v7 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v4 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v7 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v7 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v2, v2, v3 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v1, v4, v1 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: addUsesOr: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_1 | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 7, i32 0, i32 6, i32 3> | 
 |   %added = add <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %added, ptr addrspace(1) %out0 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 ; amdgpu_kernel variant with an <8 x i8> two-input shuffle
 ; (mask <1,8,5,12,0,14,2,9>). The checks expect the pointers and both vectors
 ; to be fetched with scalar s_load instructions, the result to be assembled
 ; entirely from scalar s_bfe/s_lshl/s_and/s_or operations, and one
 ; global_store_dwordx2; no v_perm_b32 appears in the expected output.
 ; The #0 attribute group is not defined in the visible part of this file.
 | define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out1) #0 { | 
 | ; GFX10-LABEL: shuffle8i8: | 
 | ; GFX10:       ; %bb.0: ; %bb | 
 | ; GFX10-NEXT:    s_clause 0x1 | 
 | ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24 | 
 | ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34 | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v2, 0 | 
 | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
 | ; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0 | 
 | ; GFX10-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-NEXT:    s_bfe_u32 s2, s5, 0x80008 | 
 | ; GFX10-NEXT:    s_lshl_b32 s1, s9, 8 | 
 | ; GFX10-NEXT:    s_bfe_u32 s9, s9, 0x100010 | 
 | ; GFX10-NEXT:    s_bfe_u32 s0, s4, 0x80008 | 
 | ; GFX10-NEXT:    s_lshl_b32 s3, s8, 8 | 
 | ; GFX10-NEXT:    s_and_b32 s5, s8, 0xff00 | 
 | ; GFX10-NEXT:    s_bfe_u32 s8, s4, 0x80010 | 
 | ; GFX10-NEXT:    s_and_b32 s4, s4, 0xff | 
 | ; GFX10-NEXT:    s_or_b32 s1, s2, s1 | 
 | ; GFX10-NEXT:    s_lshl_b32 s2, s9, 8 | 
 | ; GFX10-NEXT:    s_or_b32 s0, s0, s3 | 
 | ; GFX10-NEXT:    s_or_b32 s3, s8, s5 | 
 | ; GFX10-NEXT:    s_or_b32 s2, s4, s2 | 
 | ; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff | 
 | ; GFX10-NEXT:    s_lshl_b32 s1, s1, 16 | 
 | ; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff | 
 | ; GFX10-NEXT:    s_lshl_b32 s3, s3, 16 | 
 | ; GFX10-NEXT:    s_or_b32 s0, s0, s1 | 
 | ; GFX10-NEXT:    s_or_b32 s1, s2, s3 | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7] | 
 | ; GFX10-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX9-LABEL: shuffle8i8: | 
 | ; GFX9:       ; %bb.0: ; %bb | 
 | ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24 | 
 | ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34 | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v2, 0 | 
 | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0 | 
 | ; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0 | 
 | ; GFX9-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX9-NEXT:    s_bfe_u32 s0, s4, 0x80008 | 
 | ; GFX9-NEXT:    s_lshl_b32 s1, s9, 8 | 
 | ; GFX9-NEXT:    s_bfe_u32 s2, s5, 0x80008 | 
 | ; GFX9-NEXT:    s_lshl_b32 s3, s8, 8 | 
 | ; GFX9-NEXT:    s_or_b32 s1, s2, s1 | 
 | ; GFX9-NEXT:    s_or_b32 s0, s0, s3 | 
 | ; GFX9-NEXT:    s_bfe_u32 s2, s4, 0x80010 | 
 | ; GFX9-NEXT:    s_and_b32 s3, s4, 0xff | 
 | ; GFX9-NEXT:    s_bfe_u32 s4, s9, 0x100010 | 
 | ; GFX9-NEXT:    s_and_b32 s5, s8, 0xff00 | 
 | ; GFX9-NEXT:    s_lshl_b32 s4, s4, 8 | 
 | ; GFX9-NEXT:    s_or_b32 s2, s2, s5 | 
 | ; GFX9-NEXT:    s_or_b32 s3, s3, s4 | 
 | ; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff | 
 | ; GFX9-NEXT:    s_lshl_b32 s2, s2, 16 | 
 | ; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff | 
 | ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16 | 
 | ; GFX9-NEXT:    s_or_b32 s2, s3, s2 | 
 | ; GFX9-NEXT:    s_or_b32 s0, s0, s1 | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v1, s2 | 
 | ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7] | 
 | ; GFX9-NEXT:    s_endpgm | 
 | bb: | 
 |   %vec0 = load <8 x i8>, ptr addrspace(1) %in0 | 
 |   %vec1 = load <8 x i8>, ptr addrspace(1) %in1 | 
 |   %shuffle0 = shufflevector <8 x i8> %vec0, <8 x i8> %vec1, <8 x i32> <i32 1, i32 8, i32 5, i32 12, i32 0, i32 14, i32 2, i32 9> | 
 |   store <8 x i8> %shuffle0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Work-item id intrinsics. The *_div tests below call workitem.id.x and use the | 
 | ; result as a getelementptr index, so each work-item loads its own <4 x i8>. | 
 | declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone | 
 | ; NOTE(review): workitem.id.y has no caller in this part of the file - confirm it is used elsewhere. | 
 | declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone | 
 |  | 
 | ; Not combined to perm due to non-vectorized use, non-divergent | 
 | ; Loads one <4 x i8> from %in0 and one from %in1 (no per-work-item offset), | 
 | ; shuffles them, adds %vec1 to the shuffle result and stores the sum to %out0. | 
 | ; The expected output for both targets contains no v_perm_b32. %elt is unused. | 
 | define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: add: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v7 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v7 | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v4 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v7 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v2, v7, v2 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v3, v3, v7 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v1, v1, v4 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v2 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: add: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v2, v7, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v3, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1, giving | 
 |   ; <vec0[1], vec0[3], vec1[1], vec1[0]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4> | 
 |   ; The only user of the shuffle is this element-wise add. | 
 |   %vecins = add <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Not combined to perm due to non-vectorized use | 
 | ; Same add-of-shuffle pattern as @add, but both inputs are indexed by | 
 | ; workitem.id.x and the second shuffle operand is poison. The expected output | 
 | ; for both targets contains no v_perm_b32. %elt is unused. | 
 | define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: add_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v7 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v4 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v1, v1, v7 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: add_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0 | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Index both inputs by the work-item id so each work-item reads its own vector. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 5 and 4 select from the poison operand, so only the low two | 
 |   ; lanes (vec0[1], vec0[3]) are defined; the expected output keeps just the low | 
 |   ; 16 bits of the packed sum (v_and_b32 with 0xffff). | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 4> | 
 |   %vecins = add <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Not combined to perm due to non-divergent use | 
 | ; The shuffle has two users here: the add (stored to %out0) and a direct | 
 | ; vector store to %out1. The inputs are loaded without a per-work-item offset | 
 | ; and the second shuffle operand is poison. The expected output for both | 
 | ; targets contains no v_perm_b32. %elt is unused. | 
 | define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: add_store: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v9 | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v4 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffffff00 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v3, v2, v9 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0 | 
 | ; GFX10-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_e32 v1, v2, v1 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: add_store: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_movk_i32 s4, 0xff00 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0 | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   ; Mask indices 5 and 4 select from the poison operand; only lanes 0 and 1 | 
 |   ; (vec0[1], vec0[3]) carry defined data. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 4> | 
 |   %vecins = add <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   ; Second user: the shuffle result is also stored as a whole vector. | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Not combined to perm due to 16 bit or | 
 | ; Variant of @add_store with both inputs indexed by workitem.id.x. The second | 
 | ; shuffle operand is poison, so only the low two result lanes are defined. The | 
 | ; expected output for both targets contains no v_perm_b32. %elt is unused. | 
 | define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: add_store_div_16: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v9 | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v4 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffffff00 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v3, v2, v9 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0 | 
 | ; GFX10-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_e32 v1, v2, v1 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: add_store_div_16: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_movk_i32 s4, 0xff00 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0 | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Index both inputs by the work-item id so each work-item reads its own vector. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 5 and 4 select from the poison operand; only lanes 0 and 1 | 
 |   ; (vec0[1], vec0[3]) carry defined data. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 4> | 
 |   %vecins = add <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Vectorized use, divergent, 32 bit or | 
 | ; Both inputs are indexed by workitem.id.x and all four shuffle lanes come from | 
 | ; real data. The shuffle result stored to %out1 is expected as a single | 
 | ; v_perm_b32 (selector 0x10705) on both targets, while the add stored to %out0 | 
 | ; is still expanded into per-byte operations. %elt is unused. | 
 | define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: add_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v9 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v9 | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v4 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v9 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v2, v9, v2 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v3, v3, v9 | 
 | ; GFX10-NEXT:    v_add_nc_u16 v1, v1, v10 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v2 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x10705 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: add_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x10705 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v4, v9, s4 | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_add_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Index both inputs by the work-item id so each work-item reads its own vector. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Result lanes: <vec0[1], vec0[3], vec1[1], vec1[0]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4> | 
 |   %vecins = add <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   ; Whole-vector store of the shuffle; this is the use matched by v_perm_b32. | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Shuffle of two per-work-item <4 x i8> loads that feeds both an 'and' with a | 
 | ; constant vector (stored to %out0) and a whole-vector store to %out1. The | 
 | ; stored shuffle is expected as v_perm_b32 (selector 0x5070006) on both | 
 | ; targets; the 'and' is expanded separately. %elt is unused. | 
 | define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: and_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[2:3], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v0, 2 | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v1, 1 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_and_b32_sdwa v2, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_and_b32_e32 v3, 0x100, v9 | 
 | ; GFX10-NEXT:    v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v2 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x5070006 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: and_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x5070006 | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v0, 2 | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v1, 1 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4 | 
 | ; GFX9-NEXT:    v_and_b32_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_and_b32_e32 v9, 0x100, v4 | 
 | ; GFX9-NEXT:    v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v2, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Index both inputs by the work-item id so each work-item reads its own vector. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Result lanes: <vec0[2], vec1[0], vec0[3], vec0[1]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 4, i32 3, i32 1> | 
 |   ; Per-lane mask with the constants 1, 2, 2, 1. | 
 |   %vecins = and <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Shuffle of two per-work-item <4 x i8> loads that feeds both an 'ashr' by a | 
 | ; constant vector (stored to %out0) and a whole-vector store to %out1. The | 
 | ; stored shuffle is expected as v_perm_b32 (selector 0x4010707) on both | 
 | ; targets; the shift is expanded separately. %elt is unused. | 
 | define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: ashr_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v2, 26 | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_bfe_i32 v1, v9, 0, 8 | 
 | ; GFX10-NEXT:    v_ashrrev_i32_sdwa v2, v2, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 25, v9 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v1, 7, v1 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_ashrrev_i16 v4, 10, v0 | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v9, v0, 0x4010707 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff00, v1 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: ashr_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v1, 7 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x4010707 | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v0, 26 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4 | 
 | ; GFX9-NEXT:    v_ashrrev_i32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 25, v4 | 
 | ; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 10, v9 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffffff00, v1 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v2, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Index both inputs by the work-item id so each work-item reads its own vector. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Result lanes: <vec0[3], vec0[3], vec1[1], vec0[0]> (vec0[3] is used twice). | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 0> | 
 |   ; Per-lane arithmetic shift right by the constants 1, 2, 2, 1. | 
 |   %vecins = ashr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Shuffle of two per-work-item <4 x i8> loads that is stored twice: once after | 
 | ; a bitcast to i32 (to %out1) and once as a vector (to %out0). Both targets are | 
 | ; expected to compute one v_perm_b32 (selector 0x7060104) and store that same | 
 | ; register to both destinations. %elt is unused. | 
 | define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: bc_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v9, v4, 0x7060104 | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: bc_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x7060104 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Index both inputs by the work-item id so each work-item reads its own vector. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Result lanes: <vec1[0], vec0[1], vec1[2], vec1[3]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 6, i32 7> | 
 |   ; Reinterpret the four bytes as one i32; no lanes are changed. | 
 |   %insvec = bitcast <4 x i8> %shuffle0_0 to i32 | 
 |   store i32 %insvec, ptr addrspace(1) %out1 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; Shuffle of two per-work-item <4 x i8> loads that feeds both an | 
 | ; extractelement (the single byte is stored to %out2) and a whole-vector store | 
 | ; to %out1. The stored shuffle is expected as v_perm_b32 (selector 0x1020305) | 
 | ; on both targets, and the extracted byte as a 24-bit right shift of the first | 
 | ; loaded dword. %elt and %out0 are unused. | 
 | define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { | 
 | ; GFX10-LABEL: eve_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v5, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v5, v4, 0x1020305 | 
 | ; GFX10-NEXT:    global_store_byte v[9:10], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: eve_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v5, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x1020305 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v4 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4 | 
 | ; GFX9-NEXT:    global_store_byte v[9:10], v1, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Index both inputs by the work-item id so each work-item reads its own vector. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Result lanes: <vec1[1], vec0[3], vec0[2], vec0[1]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 3, i32 2, i32 1> | 
 |   ; Lane 1 of the shuffle is vec0[3], i.e. the top byte of the first load. | 
 |   %tmp = extractelement <4 x i8> %shuffle0_0, i32 1 | 
 |   store i8 %tmp, ptr addrspace(1) %out2 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 | ; ive_store_div: the shuffle of the two loaded vectors has two users, an | 
 | ; insertelement whose result is stored to %out0 and a direct store to %out1. | 
 | ; The insert result is not combined to perm due to multi use of the or operands | 
 | ; (introduced by the insert op); the expected output still contains a single | 
 | ; v_perm_b32 (selector 0x2000706) for the plain shuffle stored to %out1. | 
 | define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: ive_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v9, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 2, v9 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v9 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v9 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v10, v[2:3], off | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v0, 16 | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v2 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v10, v9, 0x2000706 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: ive_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v9, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 2, v9 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v9 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v9 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v10, v[2:3], off | 
 | ; GFX9-NEXT:    s_movk_i32 s4, 0xff | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v4 | 
 | ; GFX9-NEXT:    s_mov_b32 s5, 0x2000706 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v9 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_sdwa v2, v10, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_perm_b32 v3, v10, v9, s5 | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v3, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Address both inputs by workitem id x, so the loaded vectors differ per workitem. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1: <vec1[2], vec1[3], vec0[0], vec0[2]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 0, i32 2> | 
 |   ; First user of the shuffle: overwrite element 1 with %elt. | 
 |   %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   ; Second user of the shuffle: store it unmodified. | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; lhsr_store_div: the shuffle of the two loaded vectors has two users, a | 
 | ; per-element lshr whose result is stored to %out0 and a direct store to %out1. | 
 | ; The expected output lowers the lshr with shift/and/or instructions and keeps | 
 | ; a single v_perm_b32 (selector 0x1030707) for the plain shuffle. | 
 | ; %elt is not referenced by the body. | 
 | define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: lhsr_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v0, 26 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 25, v9 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 26, v4 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v1, 0x7f00, v1 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x1030707 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: lhsr_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v0, 26 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x1030707 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_lshrrev_b16_e32 v3, 1, v4 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 25, v9 | 
 | ; GFX9-NEXT:    v_perm_b32 v1, v9, v4, s4 | 
 | ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 26, v4 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f00, v3 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Address both inputs by workitem id x, so the loaded vectors differ per workitem. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1: <vec1[3], vec1[3], vec0[3], vec0[1]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1> | 
 |   ; First user of the shuffle: logical shift right by a different constant per element. | 
 |   %vecins = lshr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   ; Second user of the shuffle: store it unmodified. | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; mul_store_div: the shuffle of the two loaded vectors has two users, an | 
 | ; element-wise mul with %vec1 whose result is stored to %out0 and a direct | 
 | ; store to %out1. The expected output computes the product with 16-bit | 
 | ; multiplies and keeps a single v_perm_b32 (selector 0x2000504) for the plain | 
 | ; shuffle. %elt is not referenced by the body. | 
 | define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: mul_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v0, 8, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 24, v9 | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v9 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v9 | 
 | ; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v2 | 
 | ; GFX10-NEXT:    v_mul_lo_u16 v1, v3, v1 | 
 | ; GFX10-NEXT:    v_mul_lo_u16 v2, v4, v9 | 
 | ; GFX10-NEXT:    v_mul_lo_u16 v3, v9, v3 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x2000504 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: mul_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x2000504 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v4, v9, s4 | 
 | ; GFX9-NEXT:    v_mul_lo_u16_e32 v1, v4, v9 | 
 | ; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 | 
 | ; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_mul_lo_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Address both inputs by workitem id x, so the loaded vectors differ per workitem. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1: <vec0[0], vec0[1], vec1[0], vec1[2]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 4, i32 6> | 
 |   ; First user of the shuffle: element-wise multiply by the second loaded vector. | 
 |   %vecins = mul <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   ; Second user of the shuffle: store it unmodified. | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; or_store_div: the shuffle of the two loaded vectors has two users, an 'or' | 
 | ; with a constant vector whose result is stored to %out0 and a direct store to | 
 | ; %out1. The expected output builds the or result from shift/or instructions | 
 | ; and keeps a single v_perm_b32 (selector 0x2010005) for the plain shuffle. | 
 | ; %elt is not referenced by the body. | 
 | define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: or_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[2:3], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v0, 16 | 
 | ; GFX10-NEXT:    v_bfrev_b32_e32 v2, 4.0 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v4 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_e32 v1, 0x201, v1 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x2010005 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: or_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v2, v[2:3], off | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x2010005 | 
 | ; GFX9-NEXT:    s_movk_i32 s5, 0x102 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2 | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 8, v2 | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v4, v0, v2, s4 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_e32 v0, 0x201, v0 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v4, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Address both inputs by workitem id x, so the loaded vectors differ per workitem. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1: <vec0[1], vec1[0], vec1[1], vec1[2]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 4, i32 5, i32 6> | 
 |   ; First user of the shuffle: bitwise or with a constant vector. | 
 |   %vecins = or <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   ; Second user of the shuffle: store it unmodified. | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 | ; sdiv_store_div: the shuffle of the two loaded vectors has two users, an | 
 | ; element-wise signed division by %vec1 whose result is stored to %out0 and a | 
 | ; direct store to %out1. The expected output expands each i8 sdiv through f32 | 
 | ; conversion, reciprocal, truncation and a compare/select correction, and keeps | 
 | ; a single v_perm_b32 (selector 0x60706) for the plain shuffle. | 
 | ; %elt is not referenced by the body. | 
 | define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: sdiv_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[2:3], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v15, v1 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v16, v10 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v17, v12 | 
 | ; GFX10-NEXT:    v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v14 | 
 | ; GFX10-NEXT:    v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 | 
 | ; GFX10-NEXT:    v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 | 
 | ; GFX10-NEXT:    v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 | 
 | ; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 30, v0 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v15, v2, v15 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v16, v19, v16 | 
 | ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 30, v3 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v17, v2, v17 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v0, 1, v0 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v15, v15 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v16, v16 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v18, v1, v18 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v17, v17 | 
 | ; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 30, v11 | 
 | ; GFX10-NEXT:    v_mad_f32 v20, -v15, v1, v2 | 
 | ; GFX10-NEXT:    v_mad_f32 v19, -v16, v10, v19 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v3, 1, v3 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v18, v18 | 
 | ; GFX10-NEXT:    v_mad_f32 v2, -v17, v12, v2 | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1| | 
 | ; GFX10-NEXT:    v_ashrrev_i32_e32 v13, 30, v13 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v11, 1, v11 | 
 | ; GFX10-NEXT:    v_mad_f32 v21, -v18, v14, v1 | 
 | ; GFX10-NEXT:    v_cvt_i32_f32_e32 v15, v15 | 
 | ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10| | 
 | ; GFX10-NEXT:    v_or_b32_e32 v13, 1, v13 | 
 | ; GFX10-NEXT:    v_cvt_i32_f32_e32 v16, v16 | 
 | ; GFX10-NEXT:    v_cvt_i32_f32_e32 v17, v17 | 
 | ; GFX10-NEXT:    v_cvt_i32_f32_e32 v18, v18 | 
 | ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12| | 
 | ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v15, v0 | 
 | ; GFX10-NEXT:    v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v11, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14| | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v17, v2 | 
 | ; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v13, vcc_lo | 
 | ; GFX10-NEXT:    v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x60706 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: sdiv_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[2:3], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x60706 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4 | 
 | ; GFX9-NEXT:    v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX9-NEXT:    v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_xor_b32_sdwa v9, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX9-NEXT:    v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v15, v2 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v16, v12 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v17, v13 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v18, v4 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v15, v3, v15 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v16, v11, v16 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v15, v15 | 
 | ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v17, v3, v17 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v18, v2, v18 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v16, v16 | 
 | ; GFX9-NEXT:    v_mad_f32 v19, -v15, v2, v3 | 
 | ; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 30, v10 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v17, v17 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v18, v18 | 
 | ; GFX9-NEXT:    v_mad_f32 v11, -v16, v12, v11 | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, |v2| | 
 | ; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 30, v9 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v10, 1, v10 | 
 | ; GFX9-NEXT:    v_cvt_i32_f32_e32 v15, v15 | 
 | ; GFX9-NEXT:    v_cvt_i32_f32_e32 v16, v16 | 
 | ; GFX9-NEXT:    v_mad_f32 v3, -v17, v13, v3 | 
 | ; GFX9-NEXT:    v_cvt_i32_f32_e32 v17, v17 | 
 | ; GFX9-NEXT:    v_mad_f32 v2, -v18, v4, v2 | 
 | ; GFX9-NEXT:    v_cvt_i32_f32_e32 v18, v18 | 
 | ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v11|, |v12| | 
 | ; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 30, v14 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v9, 1, v9 | 
 | ; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v13| | 
 | ; GFX9-NEXT:    v_or_b32_e32 v14, 1, v14 | 
 | ; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v4| | 
 | ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v14, vcc | 
 | ; GFX9-NEXT:    v_add_u32_e32 v1, v15, v1 | 
 | ; GFX9-NEXT:    v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_add_u32_e32 v3, v17, v3 | 
 | ; GFX9-NEXT:    v_add_u32_sdwa v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Address both inputs by workitem id x, so the loaded vectors differ per workitem. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1: <vec0[2], vec0[3], vec0[2], vec1[0]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 3, i32 2, i32 4> | 
 |   ; First user of the shuffle: element-wise signed division by the second loaded vector. | 
 |   %vecins = sdiv <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   ; Second user of the shuffle: store it unmodified. | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; sext_store_div: the shuffle of the two loaded vectors has two users, a sext | 
 | ; to <4 x i16> and a direct store. Unlike the neighbouring tests, the derived | 
 | ; (sign-extended) value goes to %out1 and the plain shuffle goes to %out0. | 
 | ; The expected output forms the i16 elements with 16-bit arithmetic shifts and | 
 | ; packs them with v_perm_b32 selector 0x5040100, and keeps a single v_perm_b32 | 
 | ; (selector 0x3010707) for the plain shuffle. %elt is not referenced by the body. | 
 | define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: sext_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[2:3], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v9 | 
 | ; GFX10-NEXT:    v_ashrrev_i16 v2, 8, v4 | 
 | ; GFX10-NEXT:    v_ashrrev_i16 v0, 8, v0 | 
 | ; GFX10-NEXT:    v_ashrrev_i16 v3, 8, v1 | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v0, v2, 0x5040100 | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v3, v3, 0x5040100 | 
 | ; GFX10-NEXT:    v_perm_b32 v2, v9, v4, 0x3010707 | 
 | ; GFX10-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v2, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: sext_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v0, 8 | 
 | ; GFX9-NEXT:    s_mov_b32 s5, 0x5040100 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x3010707 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 8, v9 | 
 | ; GFX9-NEXT:    v_ashrrev_i16_sdwa v3, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 | 
 | ; GFX9-NEXT:    v_ashrrev_i16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 | 
 | ; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s5 | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s5 | 
 | ; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4 | 
 | ; GFX9-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v2, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Address both inputs by workitem id x, so the loaded vectors differ per workitem. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1: <vec0[3], vec0[3], vec1[1], vec1[3]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 7> | 
 |   ; First user of the shuffle: sign-extend every element to i16 (8-byte store to %out1). | 
 |   %insvec = sext <4 x i8> %shuffle0_0 to <4 x i16> | 
 |   store <4 x i16> %insvec, ptr addrspace(1) %out1 | 
 |   ; Second user of the shuffle: store it unmodified (to %out0 in this test). | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; shl_store_div: the shuffle of the two loaded vectors has two users, a | 
 | ; per-element shl whose result is stored to %out0 and a direct store to %out1. | 
 | ; The expected output lowers the shl with 16-bit shifts plus and/or masking and | 
 | ; keeps a single v_perm_b32 (selector 0x5000104) for the plain shuffle. | 
 | ; %elt is not referenced by the body. | 
 | define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: shl_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v0, 2, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v9 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v0 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v3, 0xfe, v1 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v1, 0xfffffe00, v1 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v0, 0xfc, v0 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x5000104 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shl_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x5000104 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 2, v4 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 1, v9 | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v3, 0xfffffc00, v1 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0xfe, v2 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v2, 0xfffffe00, v2 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v1, 0xfc, v1 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v3, v4, v3 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   ; Address both inputs by workitem id x, so the loaded vectors differ per workitem. | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1: <vec1[0], vec0[1], vec0[0], vec1[1]>. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 0, i32 5> | 
 |   ; First user of the shuffle: shift left by a different constant per element. | 
 |   %vecins = shl <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   ; Second user of the shuffle: store it unmodified. | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; sitofp_store_div: each lane loads one <4 x i8> from %in0 and one from %in1 | 
 | ; (both indexed by workitem id x) and shuffles them with mask <5, 2, 1, 6>. | 
 | ; The shuffle has two users: a sitofp to <4 x float> stored to %out1, and a | 
 | ; plain <4 x i8> store to %out0. The assertions below expect the plain store | 
 | ; to be fed by a single v_perm_b32 with selector 0x6010205 on both targets | 
 | ; (gfx908 first materializes the selector in s4). %elt is not referenced. | 
 | define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: sitofp_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_ashrrev_i16 v0, 8, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_ashrrev_i16 v10, 8, v9 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX10-NEXT:    v_perm_b32 v4, v9, v4, 0x6010205 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 | 
 | ; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v4, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: sitofp_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x6010205 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v4 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 8, v9 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 | 
 | ; GFX9-NEXT:    v_perm_b32 v4, v9, v4, s4 | 
 | ; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v4, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 1, i32 6> | 
 |   %insvec = sitofp <4 x i8> %shuffle0_0 to <4 x float> | 
 |   store <4 x float> %insvec, ptr addrspace(1) %out1 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; srem_store_div: each lane loads one <4 x i8> from %in0 and one from %in1 | 
 | ; (both indexed by workitem id x) and shuffles them with mask <6, 3, 7, 2>. | 
 | ; The shuffle has two users: a signed remainder against %vec1 stored to %out0, | 
 | ; and a plain <4 x i8> store to %out1. The assertions below expect the plain | 
 | ; store to be fed by a single v_perm_b32 with selector 0x2070306, while the | 
 | ; srem itself is expanded per byte through float reciprocal sequences. | 
 | ; %elt is not referenced. | 
 | define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: srem_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[2:3], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v17, v2 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v13 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v19, v3 | 
 | ; GFX10-NEXT:    v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v20, v15 | 
 | ; GFX10-NEXT:    v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 | 
 | ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX10-NEXT:    v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 | 
 | ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 30, v1 | 
 | ; GFX10-NEXT:    v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v17, v3, v17 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v18, v12, v18 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v19, v15, v19 | 
 | ; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 30, v11 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v1, 1, v1 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v17, v17 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v18, v18 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v20, v21, v20 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v19, v19 | 
 | ; GFX10-NEXT:    v_ashrrev_i32_e32 v14, 30, v14 | 
 | ; GFX10-NEXT:    v_mad_f32 v22, -v17, v2, v3 | 
 | ; GFX10-NEXT:    v_mad_f32 v12, -v18, v13, v12 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v11, 1, v11 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v20, v20 | 
 | ; GFX10-NEXT:    v_mad_f32 v23, -v19, v3, v15 | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v22|, |v2| | 
 | ; GFX10-NEXT:    v_ashrrev_i32_e32 v16, 30, v16 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v14, 1, v14 | 
 | ; GFX10-NEXT:    v_mad_f32 v21, -v20, v15, v21 | 
 | ; GFX10-NEXT:    v_cvt_i32_f32_e32 v17, v17 | 
 | ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v12|, |v13| | 
 | ; GFX10-NEXT:    v_or_b32_e32 v16, 1, v16 | 
 | ; GFX10-NEXT:    v_cvt_i32_f32_e32 v18, v18 | 
 | ; GFX10-NEXT:    v_cvt_i32_f32_e32 v19, v19 | 
 | ; GFX10-NEXT:    v_cvt_i32_f32_e32 v20, v20 | 
 | ; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v11, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v23|, |v3| | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 8, v4 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v4 | 
 | ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v17, v1 | 
 | ; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v14, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v21|, |v15| | 
 | ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v18, v2 | 
 | ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v4 | 
 | ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v19, v3 | 
 | ; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v16, vcc_lo | 
 | ; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v10 | 
 | ; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v0 | 
 | ; GFX10-NEXT:    v_add_nc_u32_e32 v11, v20, v11 | 
 | ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1 | 
 | ; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_mul_lo_u32 v10, v11, v12 | 
 | ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v12, v3 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x2070306 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: srem_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[2:3], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x2070306 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v17, v3 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 | 
 | ; GFX9-NEXT:    v_perm_b32 v1, v4, v9, s4 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v17, v10, v17 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v17, v17 | 
 | ; GFX9-NEXT:    v_mad_f32 v19, -v17, v3, v10 | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, |v3| | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v14 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v19, v10 | 
 | ; GFX9-NEXT:    v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 | 
 | ; GFX9-NEXT:    v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v3, v13, v3 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v3, v3 | 
 | ; GFX9-NEXT:    v_mad_f32 v13, -v3, v14, v13 | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v13|, |v14| | 
 | ; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 30, v15 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v15, v16, v19 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v15, v15 | 
 | ; GFX9-NEXT:    v_mad_f32 v19, -v15, v10, v16 | 
 | ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v19|, |v10| | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v10, v16 | 
 | ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 30, v2 | 
 | ; GFX9-NEXT:    v_xor_b32_sdwa v19, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_cvt_i32_f32_e32 v17, v17 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v10, v13, v10 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v10, v10 | 
 | ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3 | 
 | ; GFX9-NEXT:    v_cvt_i32_f32_e32 v15, v15 | 
 | ; GFX9-NEXT:    v_mad_f32 v13, -v10, v16, v13 | 
 | ; GFX9-NEXT:    v_cvt_i32_f32_e32 v10, v10 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v2, 1, v2 | 
 | ; GFX9-NEXT:    v_ashrrev_i32_e32 v12, 30, v12 | 
 | ; GFX9-NEXT:    v_ashrrev_i32_e32 v19, 30, v19 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v12, 1, v12 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v14, 1, v14 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v19, 1, v19 | 
 | ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v13|, |v16| | 
 | ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[4:5] | 
 | ; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, v14, s[6:7] | 
 | ; GFX9-NEXT:    v_cndmask_b32_e32 v13, 0, v19, vcc | 
 | ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4 | 
 | ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v4 | 
 | ; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 24, v4 | 
 | ; GFX9-NEXT:    v_add_u32_e32 v2, v17, v2 | 
 | ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v12 | 
 | ; GFX9-NEXT:    v_add_u32_e32 v12, v15, v14 | 
 | ; GFX9-NEXT:    v_add_u32_e32 v10, v10, v13 | 
 | ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4 | 
 | ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v11 | 
 | ; GFX9-NEXT:    v_mul_lo_u32 v4, v12, v0 | 
 | ; GFX9-NEXT:    v_mul_lo_u32 v10, v10, v18 | 
 | ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2 | 
 | ; GFX9-NEXT:    v_sub_u32_sdwa v2, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_sub_u32_e32 v3, v18, v4 | 
 | ; GFX9-NEXT:    v_sub_u32_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 2> | 
 |   %vecins = srem <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; sub_store_div: each lane loads one <4 x i8> from %in0 and one from %in1 | 
 | ; (both indexed by workitem id x) and shuffles them with mask <7, 0, 7, 6>. | 
 | ; The shuffle has two users: a byte-wise subtraction of %vec1 stored to %out0, | 
 | ; and a plain <4 x i8> store to %out1. The assertions below expect the plain | 
 | ; store to be fed by a single v_perm_b32 with selector 0x6070007 on both | 
 | ; targets. %elt is not referenced. | 
 | define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: sub_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v2, v[2:3], off | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2 | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v2 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v2 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_sub_nc_u16 v3, v0, v3 | 
 | ; GFX10-NEXT:    v_sub_nc_u16 v9, v1, v4 | 
 | ; GFX10-NEXT:    v_sub_nc_u16 v10, v4, v2 | 
 | ; GFX10-NEXT:    v_sub_nc_u16 v1, v4, v1 | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x6070007 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v4, 8, v9 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: sub_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x6070007 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4 | 
 | ; GFX9-NEXT:    v_sub_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | 
 | ; GFX9-NEXT:    v_sub_u16_sdwa v2, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_sub_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_sub_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:WORD_1 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 0, i32 7, i32 6> | 
 |   %vecins = sub <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; sv_store_div: each lane loads one <4 x i8> from %in0 and one from %in1 | 
 | ; (both indexed by workitem id x), shuffles them with mask <1, 3, 1, 4>, and | 
 | ; then shuffles that result with %vec1 again using mask <6, 3, 7, 0>. | 
 | ; Both results are stored to %out1, in that order. The assertions below | 
 | ; contain only one v_perm_b32 (selector 0x50705) and one global_store_dword | 
 | ; per target, i.e. only the second store is visible in the expected output. | 
 | ; %elt, %out0 and %out2 are not referenced. | 
 | ; NOTE(review): the two stores hit the same address, so the %insvec store is | 
 | ; overwritten and the second shuffle is never observable. The first store | 
 | ; presumably was meant to target %out0 - confirm with the test author. Any | 
 | ; change to the IR requires regenerating the assertions with | 
 | ; utils/update_llc_test_checks.py. | 
 | define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { | 
 | ; GFX10-LABEL: sv_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v5, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x50705 | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: sv_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v5, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x50705 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 1, i32 4> | 
 |   %insvec = shufflevector <4 x i8> %shuffle0_0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 0> | 
 |   store <4 x i8> %insvec, ptr addrspace(1) %out1 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; trunc_store_div: each lane loads one <4 x i8> from %in0 and one from %in1 | 
 | ; (both indexed by workitem id x) and shuffles them with mask <5, 2, 5, 0>. | 
 | ; The shuffle has two users: a trunc to <4 x i1> stored to %out1, and a plain | 
 | ; <4 x i8> store to %out0. The assertions below expect the <4 x i1> value to | 
 | ; be masked with 15 and written by one global_store_byte, and the plain store | 
 | ; to be fed by a single v_perm_b32 with selector 0x50205. | 
 | ; %elt is not referenced. | 
 | define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: trunc_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v0, 1 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_and_b32_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v2, 2, v0 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v1, 3, v4 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1 | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x50205 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v0, 15, v0 | 
 | ; GFX10-NEXT:    global_store_byte v[7:8], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: trunc_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v0, 1 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x50205 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 3, v4 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_sdwa v2, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0 | 
 | ; GFX9-NEXT:    v_perm_b32 v1, v9, v4, s4 | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 2, v2 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v3 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0 | 
 | ; GFX9-NEXT:    global_store_byte v[7:8], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 5, i32 0> | 
 |   %insvec = trunc <4 x i8> %shuffle0_0 to <4 x i1> | 
 |   store <4 x i1> %insvec, ptr addrspace(1) %out1 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 | 
 |   ret void | 
 | } | 
 |  | 
 | ; udiv: each lane loads one <4 x i8> from %in0 and one from %in1 (both | 
 | ; indexed by workitem id x) and shuffles them with mask <3, 6, 0, 4>. | 
 | ; The shuffle has two users: an unsigned division by %vec1 stored to %out0, | 
 | ; and a plain <4 x i8> store to %out1. The assertions below expect the plain | 
 | ; store to be fed by a single v_perm_b32 with selector 0x40207, while the | 
 | ; division operands are read through v_cvt_f32_ubyteN conversions of the | 
 | ; loaded dwords. %elt is not referenced. | 
 | define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: udiv: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v2, v[2:3], off | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2 | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2 | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v9, v2 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v14, v0 | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v4, v2 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v10, v1 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v11, v3 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v13, v9 | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v15, v0 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v12, v4 | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x40207 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v10, v14, v10 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v11, v4, v11 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v13, v1, v13 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v12, v15, v12 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v10, v10 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v11, v11 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v13, v13 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v12, v12 | 
 | ; GFX10-NEXT:    v_mad_f32 v14, -v10, v1, v14 | 
 | ; GFX10-NEXT:    v_cvt_u32_f32_e32 v10, v10 | 
 | ; GFX10-NEXT:    v_mad_f32 v16, -v11, v3, v4 | 
 | ; GFX10-NEXT:    v_mad_f32 v17, -v13, v9, v1 | 
 | ; GFX10-NEXT:    v_cvt_u32_f32_e32 v11, v11 | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v14|, v1 | 
 | ; GFX10-NEXT:    v_cvt_u32_f32_e32 v13, v13 | 
 | ; GFX10-NEXT:    v_mad_f32 v15, -v12, v4, v15 | 
 | ; GFX10-NEXT:    v_cvt_u32_f32_e32 v12, v12 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v16|, v3 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v17|, v9 | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v15|, v4 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_lshlrev_b16 v9, 8, v9 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: udiv: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[2:3], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x40207 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v11, v2 | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v4 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v12, v3 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v1, v9 | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v10, v4 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v13, v10 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v11, v1, v11 | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4 | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v4, v4 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v11, v11 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v14, v4 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v12, v10, v12 | 
 | ; GFX9-NEXT:    v_mad_f32 v1, -v11, v2, v1 | 
 | ; GFX9-NEXT:    v_cvt_u32_f32_e32 v11, v11 | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v9, v9 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v12, v12 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v13, v9, v13 | 
 | ; GFX9-NEXT:    v_mad_f32 v15, -v12, v3, v10 | 
 | ; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v12 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v13, v13 | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v2 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v14, v2, v14 | 
 | ; GFX9-NEXT:    v_mad_f32 v9, -v13, v10, v9 | 
 | ; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v13 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v11, vcc | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v14, v14 | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v15|, v3 | 
 | ; GFX9-NEXT:    v_mad_f32 v16, -v14, v4, v2 | 
 | ; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v14 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v12, vcc | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v9|, v10 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v13, vcc | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v16|, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v14, vcc | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2 | 
 | ; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 8, v4 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 6, i32 0, i32 4> | 
 |   %vecins = udiv <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; Shuffle whose result feeds both a uitofp and a plain store. | 
 | ; Each work item loads one <4 x i8> from %in0 and one from %in1 (indexed by | 
 | ; workitem.id.x), builds <vec1[0], vec0[1], vec0[2], vec1[1]>, stores the | 
 | ; <4 x float> conversion to %out1 and the raw shuffle to %out0. | 
 | ; %elt is not referenced by the body. | 
 | ; The assertions below expect the conversion to read bytes straight from the | 
 | ; loaded dwords (v_cvt_f32_ubyteN) and the stored shuffle to be one v_perm_b32. | 
 | define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: uitofp_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[2:3], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v9 | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v9 | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4 | 
 | ; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x5020104 | 
 | ; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v4, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: uitofp_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x5020104 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v9 | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4 | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v9 | 
 | ; GFX9-NEXT:    v_perm_b32 v10, v9, v4, s4 | 
 | ; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v10, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0, 4-7 from %vec1. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 2, i32 5> | 
 |   %insvec = uitofp <4 x i8> %shuffle0_0 to <4 x float> | 
 |   store <4 x float> %insvec, ptr addrspace(1) %out1 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; Shuffle whose result feeds both a urem and a plain store. | 
 | ; Each work item loads one <4 x i8> from %in0 and one from %in1 (indexed by | 
 | ; workitem.id.x), builds <vec1[1], vec1[1], vec1[1], vec0[2]>, stores | 
 | ; (shuffle urem vec1) to %out0 and the raw shuffle to %out1. | 
 | ; %elt is not referenced by the body. | 
 | ; The assertions below expect the per-byte remainder to be expanded through a | 
 | ; float reciprocal sequence and the stored shuffle to be one v_perm_b32. | 
 | define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: urem_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v2, v[2:3], off | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2 | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2 | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v4, v2 | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v9, v2 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v15, v0 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v10, v1 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v11, v3 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v12, v4 | 
 | ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v13, v9 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v2 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 8, v2 | 
 | ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 24, v2 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v10, v3, v10 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v11, v3, v11 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v12, v3, v12 | 
 | ; GFX10-NEXT:    v_mul_f32_e32 v13, v15, v13 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v10, v10 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v11, v11 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v12, v12 | 
 | ; GFX10-NEXT:    v_trunc_f32_e32 v13, v13 | 
 | ; GFX10-NEXT:    v_mad_f32 v18, -v10, v1, v3 | 
 | ; GFX10-NEXT:    v_cvt_u32_f32_e32 v10, v10 | 
 | ; GFX10-NEXT:    v_mad_f32 v19, -v11, v3, v3 | 
 | ; GFX10-NEXT:    v_cvt_u32_f32_e32 v11, v11 | 
 | ; GFX10-NEXT:    v_mad_f32 v20, -v12, v4, v3 | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v18|, v1 | 
 | ; GFX10-NEXT:    v_cvt_u32_f32_e32 v12, v12 | 
 | ; GFX10-NEXT:    v_mad_f32 v15, -v13, v9, v15 | 
 | ; GFX10-NEXT:    v_cvt_u32_f32_e32 v13, v13 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v19|, v3 | 
 | ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v20|, v4 | 
 | ; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v16 | 
 | ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, v16, v1 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo | 
 | ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v15|, v9 | 
 | ; GFX10-NEXT:    v_mul_lo_u32 v4, v4, v14 | 
 | ; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_mul_lo_u32 v9, v9, v17 | 
 | ; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v16, v4 | 
 | ; GFX10-NEXT:    v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x2050505 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v1, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: urem_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[2:3], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x2050505 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v15, v2 | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v4 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v16, v3 | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v11, v4 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v17, v11 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v15, v3, v15 | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v14, v4 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v15, v15 | 
 | ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v18, v14 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v16, v3, v16 | 
 | ; GFX9-NEXT:    v_mad_f32 v19, -v15, v2, v3 | 
 | ; GFX9-NEXT:    v_cvt_u32_f32_e32 v15, v15 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v16, v16 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v17, v3, v17 | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, v2 | 
 | ; GFX9-NEXT:    v_mad_f32 v2, -v16, v3, v3 | 
 | ; GFX9-NEXT:    v_cvt_u32_f32_e32 v16, v16 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v13, v9 | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v17, v17 | 
 | ; GFX9-NEXT:    v_mul_f32_e32 v18, v13, v18 | 
 | ; GFX9-NEXT:    v_mad_f32 v19, -v17, v11, v3 | 
 | ; GFX9-NEXT:    v_cvt_u32_f32_e32 v17, v17 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v15, vcc | 
 | ; GFX9-NEXT:    v_trunc_f32_e32 v18, v18 | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3 | 
 | ; GFX9-NEXT:    v_mad_f32 v13, -v18, v14, v13 | 
 | ; GFX9-NEXT:    v_cvt_u32_f32_e32 v18, v18 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v16, vcc | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, v11 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v17, vcc | 
 | ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v13|, v14 | 
 | ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4 | 
 | ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v4 | 
 | ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v18, vcc | 
 | ; GFX9-NEXT:    v_perm_b32 v1, v4, v9, s4 | 
 | ; GFX9-NEXT:    v_mul_lo_u32 v4, v15, v4 | 
 | ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v10 | 
 | ; GFX9-NEXT:    v_mul_lo_u32 v0, v3, v0 | 
 | ; GFX9-NEXT:    v_mul_lo_u32 v3, v11, v12 | 
 | ; GFX9-NEXT:    v_sub_u32_e32 v4, v10, v4 | 
 | ; GFX9-NEXT:    v_sub_u32_sdwa v2, v10, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_sub_u32_e32 v0, v10, v0 | 
 | ; GFX9-NEXT:    v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0, 4-7 from %vec1. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 2> | 
 |   %vecins = urem <4 x i8> %shuffle0_0, %vec1 | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; Shuffle whose result feeds both a constant xor and a plain store. | 
 | ; Each work item loads one <4 x i8> from %in0 and one from %in1 (indexed by | 
 | ; workitem.id.x), builds <vec1[3], vec0[3], vec1[2], vec1[1]>, stores | 
 | ; (shuffle xor <1, 2, 2, 1>) to %out0 and the raw shuffle to %out1. | 
 | ; %elt is not referenced by the body. | 
 | ; The assertions below expect the stored shuffle to be one v_perm_b32, while | 
 | ; the xor operands are assembled separately with SDWA and/xor/or operations. | 
 | define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: xor_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffffff00 | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v1, 1 | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v2, 2 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff00, v9 | 
 | ; GFX10-NEXT:    v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_xor_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_xor_b32_e32 v0, 0x200, v0 | 
 | ; GFX10-NEXT:    v_xor_b32_e32 v3, 0x100, v3 | 
 | ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0 | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x5060307 | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[7:8], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: xor_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_movk_i32 s4, 0xff00 | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v0, 1 | 
 | ; GFX9-NEXT:    v_mov_b32_e32 v1, 2 | 
 | ; GFX9-NEXT:    s_mov_b32 s5, 0x5060307 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_and_b32_sdwa v2, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff00, v9 | 
 | ; GFX9-NEXT:    v_xor_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_xor_b32_e32 v2, 0x200, v2 | 
 | ; GFX9-NEXT:    v_xor_b32_e32 v3, 0x100, v3 | 
 | ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2 | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_perm_b32 v4, v9, v4, s5 | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[7:8], v4, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0, 4-7 from %vec1. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 3, i32 6, i32 5> | 
 |   %vecins = xor <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> | 
 |   store <4 x i8> %vecins, ptr addrspace(1) %out0 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; Shuffle whose result feeds both a zext and a plain store. | 
 | ; Each work item loads one <4 x i8> from %in0 and one from %in1 (indexed by | 
 | ; workitem.id.x), builds <vec0[0], vec0[1], vec0[2], vec1[0]>, stores the | 
 | ; <4 x i16> zero-extension to %out1 and the raw shuffle to %out0. | 
 | ; %elt is not referenced by the body. | 
 | ; The assertions below expect one v_perm_b32 for the stored shuffle plus two | 
 | ; more v_perm_b32 (selector 0x5040100) packing the widened halves. | 
 | define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: zext_store_div: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo | 
 | ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4 | 
 | ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo | 
 | ; GFX10-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX10-NEXT:    v_mov_b32_e32 v0, 0xff | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v4 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v9 | 
 | ; GFX10-NEXT:    v_and_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v1, v2, 0x5040100 | 
 | ; GFX10-NEXT:    v_perm_b32 v2, v4, v9, 0x60504 | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v3, v10, 0x5040100 | 
 | ; GFX10-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off | 
 | ; GFX10-NEXT:    global_store_dword v[5:6], v2, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: zext_store_div: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4 | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | 
 | ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4 | 
 | ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | 
 | ; GFX9-NEXT:    global_load_dword v4, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v9, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x60504 | 
 | ; GFX9-NEXT:    s_movk_i32 s5, 0xff | 
 | ; GFX9-NEXT:    s_mov_b32 s6, 0x5040100 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v4 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v4 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v3, 0xff, v9 | 
 | ; GFX9-NEXT:    v_and_b32_sdwa v4, v4, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s6 | 
 | ; GFX9-NEXT:    v_perm_b32 v1, v3, v4, s6 | 
 | ; GFX9-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off | 
 | ; GFX9-NEXT:    global_store_dword v[5:6], v2, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %tid = call i32 @llvm.amdgcn.workitem.id.x() | 
 |   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid | 
 |   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid | 
 |   %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 | 
 |   ; Mask indices 0-3 select from %vec0, 4-7 from %vec1. | 
 |   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 4> | 
 |   %insvec = zext <4 x i8> %shuffle0_0 to <4 x i16> | 
 |   store <4 x i16> %insvec, ptr addrspace(1) %out1 | 
 |   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Byte interleave of two 16-bit sources built from and/shl/lshr/or. | 
 | ; With %elt0 = element 1 of %reg, the stored i32 is, from least to most | 
 | ; significant byte: <%in low, %elt0 low, %in high, %elt0 high>. | 
 | ; The store address is poison; only the value computation is under test. | 
 | ; The assertions below expect the whole pattern to become one v_perm_b32. | 
 | define void @Source16Bit(i16 %in, <2 x i16> %reg) { | 
 | ; GFX10-LABEL: Source16Bit: | 
 | ; GFX10:       ; %bb.0: ; %entry | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3050204 | 
 | ; GFX10-NEXT:    global_store_dword v[0:1], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: Source16Bit: | 
 | ; GFX9:       ; %bb.0: ; %entry | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x3050204 | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[0:1], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 | entry: | 
 |   %elt0 = extractelement <2 x i16> %reg, i32 1 | 
 |   ; Split each 16-bit source into its low byte (and 255) and high byte (and -256). | 
 |   %e0b0 = and i16 %elt0, 255 | 
 |   %e0b1 = and i16 %elt0, -256 | 
 |   %e1b0 = and i16 %in, 255 | 
 |   %e1b1 = and i16 %in, -256 | 
 |   ; Low half of the result: %elt0 low byte above %in low byte. | 
 |   %tmp0 = shl i16 %e0b0, 8 | 
 |   %byte0 = or i16 %tmp0, %e1b0 | 
 |   ; High half of the result: %elt0 high byte above %in high byte. | 
 |   %tmp2 = lshr i16 %e1b1, 8 | 
 |   %byte1 = or i16 %e0b1, %tmp2 | 
 |   %ext0 = zext i16 %byte0 to i32 | 
 |   %ext1 = zext i16 %byte1 to i32 | 
 |   %shifted = shl i32 %ext1, 16 | 
 |   %result = or i32 %shifted, %ext0 | 
 |   store i32 %result, ptr addrspace(1) poison | 
 |   ret void | 
 | } | 
 |  | 
 | ; Byte permute written as extractelement/zext/shl/or. | 
 | ; Loads <4 x i8> %vec1 from %in0 and %vec2 from %in1, then stores to %out0 an | 
 | ; i32 whose bytes, from least to most significant, are | 
 | ; <vec1[0], vec1[0], vec1[3], vec2[3]>. | 
 | ; The assertions below expect the whole pattern to become one v_perm_b32. | 
 | define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: extract3744: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x3070404 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract3744: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x3070404 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   ; vec1[0] is used twice: unshifted (result byte 0) and shifted by 8 (byte 1). | 
 |   %v1e0 = extractelement <4 x i8> %vec1, i64 0 | 
 |   %zv1e0 = zext i8 %v1e0 to i32 | 
 |   %byte1 = shl i32 %zv1e0, 8 | 
 |  | 
 |   %v1e3 = extractelement <4 x i8> %vec1, i64 3 | 
 |   %zv1e3 = zext i8 %v1e3 to i32 | 
 |   %byte2 = shl i32 %zv1e3, 16 | 
 |   %v2e3 = extractelement <4 x i8> %vec2, i64 3 | 
 |   %zv2e3 = zext i8 %v2e3 to i32 | 
 |   %byte3 = shl i32 %zv2e3, 24 | 
 |  | 
 |   %tmp0 = or i32 %zv1e0, %byte1 | 
 |   %tmp1 = or i32 %tmp0, %byte2 | 
 |   %res = or i32 %tmp1, %byte3 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | declare i32 @llvm.amdgcn.perm(i32, i32, i32) | 
 |  | 
 | ; Two llvm.amdgcn.perm calls, one per loaded dword, whose results are or'ed | 
 | ; and stored to %out0. | 
 | ; The assertions below expect the pair to fold into one v_perm_b32 with | 
 | ; selector 0x3070404, the same selector expected for extract3744. | 
 | define hidden void @extract_perm_3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: extract_perm_3744: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x3070404 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract_perm_3744: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x3070404 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   %cast1 = bitcast <4 x i8> %vec1 to i32 | 
 |   %cast2 = bitcast <4 x i8> %vec2 to i32 | 
 |   ; 201523200 = 0x0C030000 and 51121164 = 0x030C0C0C. | 
 |   ; NOTE(review): a 0x0C selector byte presumably yields a zero byte, making the | 
 |   ; two results disjoint so the or merges them - confirm against the ISA manual. | 
 |   %lo24 = call i32 @llvm.amdgcn.perm(i32 %cast1, i32 %cast1, i32 201523200) | 
 |   %hi8 = call i32 @llvm.amdgcn.perm(i32 %cast2, i32 %cast2, i32 51121164) | 
 |   %res = or i32 %hi8, %lo24 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Byte permute whose sources are <2 x i16> vectors rather than <4 x i8>. | 
 | ; Loads %vec1 from %in0 and %vec2 from %in1, then stores to %out0 an i32 whose | 
 | ; bytes, from least to most significant, are | 
 | ; <vec2[1] high byte, vec2[0] low byte, vec1[1] high byte, vec1[0] high byte>. | 
 | ; The assertions below expect the whole pattern to become one v_perm_b32. | 
 | define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: extract1347_v2i16: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x1030407 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract1347_v2i16: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x1030407 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 | 
 |   %v1e0 = extractelement <2 x i16> %vec1, i64 0 | 
 |   %v1e1 = extractelement <2 x i16> %vec1, i64 1 | 
 |   %v2e0 = extractelement <2 x i16> %vec2, i64 0 | 
 |   %v2e1 = extractelement <2 x i16> %vec2, i64 1 | 
 |  | 
 |   ; Result byte 0: high byte of vec2[1]. | 
 |   %b0t0 = and i16 -256, %v2e1 | 
 |   %b0t1 = lshr i16 %b0t0, 8 | 
 |   %byte0 = zext i16 %b0t1 to i32 | 
 |  | 
 |   ; Result byte 1: low byte of vec2[0]. | 
 |   %b1t0 = and i16 255, %v2e0 | 
 |   %b1t1 = zext i16 %b1t0 to i32 | 
 |   %byte1 = shl i32 %b1t1, 8 | 
 |  | 
 |   ; Result byte 2: high byte of vec1[1]. | 
 |   %b2t0 = and i16 -256, %v1e1 | 
 |   %b2t1 = lshr i16 %b2t0, 8 | 
 |   %b2t2 = zext i16 %b2t1 to i32 | 
 |   %byte2 = shl i32 %b2t2, 16 | 
 |  | 
 |   ; Result byte 3: high byte of vec1[0]. | 
 |   %b3t0 = and i16 -256, %v1e0 | 
 |   %b3t1 = lshr i16 %b3t0, 8 | 
 |   %b3t2 = zext i16 %b3t1 to i32 | 
 |   %byte3 = shl i32 %b3t2, 24 | 
 |  | 
 |   %tmp0 = or i32 %byte0, %byte1 | 
 |   %tmp1 = or i32 %tmp0, %byte2 | 
 |   %res = or i32 %tmp1, %byte3 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | declare i16 @llvm.fshr.i16(i16, i16, i16) | 
 |  | 
 | ; 16-bit funnel shift right by 8 applied to each loaded <2 x i16>. | 
 | ; fshr(vec2[0], vec2[1], 8) fills the low half of the stored i32 and | 
 | ; fshr(vec1[0], vec1[1], 8) fills the high half; the result goes to %out0. | 
 | ; The assertions below expect both shifts and the merge to become one | 
 | ; v_perm_b32 with selector 0x30407. | 
 | define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: fshri16_8: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: fshri16_8: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x30407 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 | 
 |   %v1e0 = extractelement <2 x i16> %vec1, i64 0 | 
 |   %v1e1 = extractelement <2 x i16> %vec1, i64 1 | 
 |   %v2e0 = extractelement <2 x i16> %vec2, i64 0 | 
 |   %v2e1 = extractelement <2 x i16> %vec2, i64 1 | 
 |  | 
 |   ; Low 16 bits of the result come from %vec2. | 
 |   %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 8) | 
 |   %byte01 = zext i16 %tmp01.0 to i32 | 
 |  | 
 |   ; High 16 bits of the result come from %vec1. | 
 |   %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 8) | 
 |   %tmp23.1 = zext i16 %tmp23.0 to i32 | 
 |   %byte23 = shl i32 %tmp23.1, 16 | 
 |   %res = or i32 %byte01, %byte23 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; 16-bit funnel shift right where the shift amount (16) equals the bit width. | 
 | ; fshr(vec2[0], vec2[1], 16) fills the low half of the stored i32 and | 
 | ; fshr(vec1[0], vec1[1], 16) fills the high half; the result goes to %out0. | 
 | ; NOTE(review): fshr presumably takes the amount modulo the bit width, making | 
 | ; this a shift by 0 that returns the second operand - confirm in the LangRef. | 
 | ; The assertions below expect one v_perm_b32 with selector 0x3020706. | 
 | define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: fshri16_16: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3020706 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: fshri16_16: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 | 
 |   %v1e0 = extractelement <2 x i16> %vec1, i64 0 | 
 |   %v1e1 = extractelement <2 x i16> %vec1, i64 1 | 
 |   %v2e0 = extractelement <2 x i16> %vec2, i64 0 | 
 |   %v2e1 = extractelement <2 x i16> %vec2, i64 1 | 
 |  | 
 |   ; Low 16 bits of the result come from %vec2. | 
 |   %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 16) | 
 |   %byte01 = zext i16 %tmp01.0 to i32 | 
 |  | 
 |   ; High 16 bits of the result come from %vec1. | 
 |   %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 16) | 
 |   %tmp23.1 = zext i16 %tmp23.0 to i32 | 
 |   %byte23 = shl i32 %tmp23.1, 16 | 
 |   %res = or i32 %byte01, %byte23 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; 16-bit funnel shift right where the shift amount (24) exceeds the bit width. | 
 | ; fshr(vec2[0], vec2[1], 24) fills the low half of the stored i32 and | 
 | ; fshr(vec1[0], vec1[1], 24) fills the high half; the result goes to %out0. | 
 | ; The assertions below expect one v_perm_b32 with selector 0x30407, the same | 
 | ; selector expected for fshri16_8 (shift amount 8). | 
 | define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: fshri16_24: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: fshri16_24: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x30407 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 | 
 |   %v1e0 = extractelement <2 x i16> %vec1, i64 0 | 
 |   %v1e1 = extractelement <2 x i16> %vec1, i64 1 | 
 |   %v2e0 = extractelement <2 x i16> %vec2, i64 0 | 
 |   %v2e1 = extractelement <2 x i16> %vec2, i64 1 | 
 |  | 
 |   ; Low 16 bits of the result come from %vec2. | 
 |   %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 24) | 
 |   %byte01 = zext i16 %tmp01.0 to i32 | 
 |  | 
 |   ; High 16 bits of the result come from %vec1. | 
 |   %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 24) | 
 |   %tmp23.1 = zext i16 %tmp23.0 to i32 | 
 |   %byte23 = shl i32 %tmp23.1, 16 | 
 |   %res = or i32 %byte01, %byte23 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Same shape as the other fshri16_* tests, with a constant shift amount of 32 | 
 | ; (twice the i16 width). Two llvm.fshr.i16 results are packed into one i32 | 
 | ; and the assertions expect a single v_perm_b32 with selector 0x3020706; on | 
 | ; gfx908 the selector is first moved into s4. | 
 | define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: fshri16_32: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3020706 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: fshri16_32: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 | 
 |   %v1e0 = extractelement <2 x i16> %vec1, i64 0 | 
 |   %v1e1 = extractelement <2 x i16> %vec1, i64 1 | 
 |   %v2e0 = extractelement <2 x i16> %vec2, i64 0 | 
 |   %v2e1 = extractelement <2 x i16> %vec2, i64 1 | 
 |  | 
 |   ; Low half. Per the LangRef the shift amount is taken modulo 16, so 32 | 
 |   ; behaves like a shift by 0. | 
 |   %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 32) | 
 |   %byte01 = zext i16 %tmp01.0 to i32 | 
 |  | 
 |   ; High half: same funnel shift on the lanes of %vec1, moved to bits 31:16. | 
 |   %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 32) | 
 |   %tmp23.1 = zext i16 %tmp23.0 to i32 | 
 |   %byte23 = shl i32 %tmp23.1, 16 | 
 |   %res = or i32 %byte01, %byte23 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Same shape as the other fshri16_* tests, with a constant shift amount of 88, | 
 | ; far larger than the i16 width. Two llvm.fshr.i16 results are packed into | 
 | ; one i32 and the assertions expect a single v_perm_b32 with selector 0x30407; | 
 | ; on gfx908 the selector is first moved into s4. | 
 | define hidden void @fshri16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: fshri16_88: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: fshri16_88: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x30407 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 | 
 |   %v1e0 = extractelement <2 x i16> %vec1, i64 0 | 
 |   %v1e1 = extractelement <2 x i16> %vec1, i64 1 | 
 |   %v2e0 = extractelement <2 x i16> %vec2, i64 0 | 
 |   %v2e1 = extractelement <2 x i16> %vec2, i64 1 | 
 |  | 
 |   ; Low half. Per the LangRef the shift amount is taken modulo 16, so 88 | 
 |   ; behaves like a shift by 8. | 
 |   %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 88) | 
 |   %byte01 = zext i16 %tmp01.0 to i32 | 
 |  | 
 |   ; High half: same funnel shift on the lanes of %vec1, moved to bits 31:16. | 
 |   %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 88) | 
 |   %tmp23.1 = zext i16 %tmp23.0 to i32 | 
 |   %byte23 = shl i32 %tmp23.1, 16 | 
 |   %res = or i32 %byte01, %byte23 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Funnel-shift-left intrinsic on i16, called by the fshli16_* tests below. | 
 | declare i16 @llvm.fshl.i16(i16, i16, i16) | 
 |  | 
 | ; Packs two llvm.fshl.i16 results (constant shift amount 8) into one i32: the | 
 | ; result computed from the %in1 vector lands in bits 15:0, the result computed | 
 | ; from the %in0 vector in bits 31:16. The assertions expect a single | 
 | ; v_perm_b32 with selector 0x30407; on gfx908 the selector is first moved | 
 | ; into s4. | 
 | ; NOTE(review): unlike the sibling fshli16_* tests, the name suffix here is not | 
 | ; the shift amount (8) -- presumably it describes the selected bytes; confirm. | 
 | define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: fshli16_1347: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: fshli16_1347: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x30407 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 | 
 |   %v1e0 = extractelement <2 x i16> %vec1, i64 0 | 
 |   %v1e1 = extractelement <2 x i16> %vec1, i64 1 | 
 |   %v2e0 = extractelement <2 x i16> %vec2, i64 0 | 
 |   %v2e1 = extractelement <2 x i16> %vec2, i64 1 | 
 |  | 
 |   ; Low half: funnel-shift-left of the two %vec2 lanes by 8, widened to i32. | 
 |   %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 8) | 
 |   %byte01 = zext i16 %tmp01.0 to i32 | 
 |  | 
 |   ; High half: same funnel shift on the lanes of %vec1, moved to bits 31:16. | 
 |   %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 8) | 
 |   %tmp23.1 = zext i16 %tmp23.0 to i32 | 
 |   %byte23 = shl i32 %tmp23.1, 16 | 
 |   %res = or i32 %byte01, %byte23 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Packs two llvm.fshl.i16 results (constant shift amount 16, equal to the | 
 | ; operand width) into one i32. The assertions expect a single v_perm_b32 with | 
 | ; selector 0x1000504; on gfx908 the selector is first moved into s4. | 
 | define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: fshli16_16: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x1000504 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: fshli16_16: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x1000504 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 | 
 |   %v1e0 = extractelement <2 x i16> %vec1, i64 0 | 
 |   %v1e1 = extractelement <2 x i16> %vec1, i64 1 | 
 |   %v2e0 = extractelement <2 x i16> %vec2, i64 0 | 
 |   %v2e1 = extractelement <2 x i16> %vec2, i64 1 | 
 |  | 
 |   ; Low half. Per the LangRef the shift amount is taken modulo 16, so 16 | 
 |   ; behaves like a shift by 0. | 
 |   %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 16) | 
 |   %byte01 = zext i16 %tmp01.0 to i32 | 
 |  | 
 |   ; High half: same funnel shift on the lanes of %vec1, moved to bits 31:16. | 
 |   %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 16) | 
 |   %tmp23.1 = zext i16 %tmp23.0 to i32 | 
 |   %byte23 = shl i32 %tmp23.1, 16 | 
 |   %res = or i32 %byte01, %byte23 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Packs two llvm.fshl.i16 results (constant shift amount 24) into one i32. | 
 | ; The assertions expect a single v_perm_b32 with selector 0x30407; on gfx908 | 
 | ; the selector is first moved into s4. | 
 | define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: fshli16_24: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: fshli16_24: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x30407 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 | 
 |   %v1e0 = extractelement <2 x i16> %vec1, i64 0 | 
 |   %v1e1 = extractelement <2 x i16> %vec1, i64 1 | 
 |   %v2e0 = extractelement <2 x i16> %vec2, i64 0 | 
 |   %v2e1 = extractelement <2 x i16> %vec2, i64 1 | 
 |  | 
 |   ; Low half. Per the LangRef the shift amount is taken modulo 16, so 24 | 
 |   ; behaves like a shift by 8. | 
 |   %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 24) | 
 |   %byte01 = zext i16 %tmp01.0 to i32 | 
 |  | 
 |   ; High half: same funnel shift on the lanes of %vec1, moved to bits 31:16. | 
 |   %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 24) | 
 |   %tmp23.1 = zext i16 %tmp23.0 to i32 | 
 |   %byte23 = shl i32 %tmp23.1, 16 | 
 |   %res = or i32 %byte01, %byte23 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Packs two llvm.fshl.i16 results (constant shift amount 32, twice the operand | 
 | ; width) into one i32. The assertions expect a single v_perm_b32 with selector | 
 | ; 0x1000504; on gfx908 the selector is first moved into s4. | 
 | define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: fshli16_32: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x1000504 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: fshli16_32: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x1000504 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 | 
 |   %v1e0 = extractelement <2 x i16> %vec1, i64 0 | 
 |   %v1e1 = extractelement <2 x i16> %vec1, i64 1 | 
 |   %v2e0 = extractelement <2 x i16> %vec2, i64 0 | 
 |   %v2e1 = extractelement <2 x i16> %vec2, i64 1 | 
 |  | 
 |   ; Low half. Per the LangRef the shift amount is taken modulo 16, so 32 | 
 |   ; behaves like a shift by 0. | 
 |   %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 32) | 
 |   %byte01 = zext i16 %tmp01.0 to i32 | 
 |  | 
 |   ; High half: same funnel shift on the lanes of %vec1, moved to bits 31:16. | 
 |   %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 32) | 
 |   %tmp23.1 = zext i16 %tmp23.0 to i32 | 
 |   %byte23 = shl i32 %tmp23.1, 16 | 
 |   %res = or i32 %byte01, %byte23 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Packs two llvm.fshl.i16 results (constant shift amount 88, far larger than | 
 | ; the operand width) into one i32. The assertions expect a single v_perm_b32 | 
 | ; with selector 0x30407; on gfx908 the selector is first moved into s4. | 
 | define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: fshli16_88: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: fshli16_88: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[2:3], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x30407 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 | 
 |   %v1e0 = extractelement <2 x i16> %vec1, i64 0 | 
 |   %v1e1 = extractelement <2 x i16> %vec1, i64 1 | 
 |   %v2e0 = extractelement <2 x i16> %vec2, i64 0 | 
 |   %v2e1 = extractelement <2 x i16> %vec2, i64 1 | 
 |  | 
 |   ; Low half. Per the LangRef the shift amount is taken modulo 16, so 88 | 
 |   ; behaves like a shift by 8. | 
 |   %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 88) | 
 |   %byte01 = zext i16 %tmp01.0 to i32 | 
 |  | 
 |   ; High half: same funnel shift on the lanes of %vec1, moved to bits 31:16. | 
 |   %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 88) | 
 |   %tmp23.1 = zext i16 %tmp23.0 to i32 | 
 |   %byte23 = shl i32 %tmp23.1, 16 | 
 |   %res = or i32 %byte01, %byte23 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; ORs together three zero-extended bytes shifted left by runtime amounts | 
 | ; (%base + 8, %base + 16, %base + 24) plus one unshifted byte. Because the | 
 | ; shift amounts are not constants, the assertions expect explicit adds, | 
 | ; shifts and ors on both targets rather than a v_perm_b32. | 
 | define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i32 %base) { | 
 | ; GFX10-LABEL: shlbase: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v7, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v8, v[2:3], off | 
 | ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 16, v6 | 
 | ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 24, v6 | 
 | ; GFX10-NEXT:    v_add_nc_u32_e32 v3, 8, v6 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v7 | 
 | ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 | 
 | ; GFX10-NEXT:    v_lshl_or_b32 v2, v2, v3, v2 | 
 | ; GFX10-NEXT:    v_or3_b32 v0, v2, v0, v1 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: shlbase: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v7, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v8, v[2:3], off | 
 | ; GFX9-NEXT:    v_add_u32_e32 v0, 8, v6 | 
 | ; GFX9-NEXT:    v_add_u32_e32 v1, 16, v6 | 
 | ; GFX9-NEXT:    v_add_u32_e32 v2, 24, v6 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v3, 0xff, v7 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 | 
 | ; GFX9-NEXT:    v_lshl_or_b32 v0, v3, v0, v3 | 
 | ; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   ; Element 0 of %vec1 is used twice: unshifted and shifted by %base + 8. | 
 |   %v1e0 = extractelement <4 x i8> %vec1, i64 0 | 
 |   %zv1e0 = zext i8 %v1e0 to i32 | 
 |   %b8 = add i32 %base, 8 | 
 |   %byte1 = shl i32 %zv1e0, %b8 | 
 |  | 
 |   ; Element 3 of %vec1 shifted by %base + 16, element 3 of %vec2 by %base + 24. | 
 |   %v1e3 = extractelement <4 x i8> %vec1, i64 3 | 
 |   %zv1e3 = zext i8 %v1e3 to i32 | 
 |   %b16 = add i32 %base, 16 | 
 |   %byte2 = shl i32 %zv1e3, %b16 | 
 |   %v2e3 = extractelement <4 x i8> %vec2, i64 3 | 
 |   %zv2e3 = zext i8 %v2e3 to i32 | 
 |   %b24 = add i32 %base, 24 | 
 |   %byte3 = shl i32 %zv2e3, %b24 | 
 |  | 
 |   ; Combine the four terms and store the i32 result. | 
 |   %tmp0 = or i32 %zv1e0, %byte1 | 
 |   %tmp1 = or i32 %tmp0, %byte2 | 
 |   %res = or i32 %tmp1, %byte3 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; TODO -- lower into v_perm | 
 | ; Builds an i32 from bytes picked with runtime vector indices (%base and | 
 | ; %base + 3) and constant shifts of 8, 16 and 24. The current assertions | 
 | ; expect bit-field extracts, shifts and ors; no v_perm_b32 is emitted yet. | 
 | define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i64 %base) { | 
 | ; GFX10-LABEL: extractbase: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v7, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v8, v[2:3], off | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v6 | 
 | ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 24, v0 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_bfe_u32 v2, v7, v1, 8 | 
 | ; GFX10-NEXT:    v_bfe_u32 v0, v7, v0, 8 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2 | 
 | ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v0 | 
 | ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extractbase: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v7, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v8, v[2:3], off | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v6 | 
 | ; GFX9-NEXT:    v_add_u32_e32 v1, 24, v0 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_bfe_u32 v0, v7, v0, 8 | 
 | ; GFX9-NEXT:    v_bfe_u32 v2, v7, v1, 8 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2 | 
 | ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 8, v0 | 
 | ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   ; Element %base of %vec1 feeds both byte 0 (unshifted) and byte 1 (shl 8). | 
 |   %v1b = extractelement <4 x i8> %vec1, i64 %base | 
 |   %zv1b = zext i8 %v1b to i32 | 
 |   %byte1 = shl i32 %zv1b, 8 | 
 |  | 
 |   ; Element %base + 3 of %vec1 becomes byte 2, the same index of %vec2 byte 3. | 
 |   %b3 = add i64 %base, 3 | 
 |   %v1b3 = extractelement <4 x i8> %vec1, i64 %b3 | 
 |   %zv1b3 = zext i8 %v1b3 to i32 | 
 |   %byte2 = shl i32 %zv1b3, 16 | 
 |   %v2b3 = extractelement <4 x i8> %vec2, i64 %b3 | 
 |   %zv2b3 = zext i8 %v2b3 to i32 | 
 |   %byte3 = shl i32 %zv2b3, 24 | 
 |  | 
 |   ; Combine the four bytes and store the i32 result. | 
 |   %tmp0 = or i32 %zv1b, %byte1 | 
 |   %tmp1 = or i32 %tmp0, %byte2 | 
 |   %res = or i32 %tmp1, %byte3 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Builds an i32 from constant-index bytes of two <8 x i8> vectors: elements 5 | 
 | ; and 6 of the %in0 vector and element 3 of the %in1 vector. The assertions | 
 | ; expect only one dword to be loaded per source (one of them at byte offset 4) | 
 | ; followed by a single v_perm_b32 with selector 0x3060505; on gfx908 the | 
 | ; selector is first moved into s4. | 
 | define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: extract_hilo: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[2:3], off | 
 | ; GFX10-NEXT:    global_load_dword v7, v[0:1], off offset:4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3060505 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract_hilo: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[2:3], off | 
 | ; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:4 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x3060505 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   ; Element 5 of %vec1 feeds both byte 0 (unshifted) and byte 1 (shl 8). | 
 |   %v1e5 = extractelement <8 x i8> %vec1, i64 5 | 
 |   %zv1e5 = zext i8 %v1e5 to i32 | 
 |   %byte1 = shl i32 %zv1e5, 8 | 
 |  | 
 |   ; Element 6 of %vec1 becomes byte 2, element 3 of %vec2 becomes byte 3. | 
 |   %v1e6 = extractelement <8 x i8> %vec1, i64 6 | 
 |   %zv1e6 = zext i8 %v1e6 to i32 | 
 |   %byte2 = shl i32 %zv1e6, 16 | 
 |   %v2e3 = extractelement <8 x i8> %vec2, i64 3 | 
 |   %zv2e3 = zext i8 %v2e3 to i32 | 
 |   %byte3 = shl i32 %zv2e3, 24 | 
 |  | 
 |   ; Combine the four bytes and store the i32 result. | 
 |   %tmp0 = or i32 %zv1e5, %byte1 | 
 |   %tmp1 = or i32 %tmp0, %byte2 | 
 |   %res = or i32 %tmp1, %byte3 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Builds an i32 from constant-index bytes of two <8 x i8> vectors: elements 0 | 
 | ; and 3 of the %in0 vector and element 4 of the %in1 vector. The assertions | 
 | ; expect only one dword to be loaded per source (one of them at byte offset 4) | 
 | ; followed by a single v_perm_b32 with selector 0x70404; on gfx908 the | 
 | ; selector is first moved into s4. | 
 | define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: extract_lohi: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[2:3], off offset:4 | 
 | ; GFX10-NEXT:    global_load_dword v7, v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x70404 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract_lohi: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[2:3], off offset:4 | 
 | ; GFX9-NEXT:    global_load_dword v7, v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x70404 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   ; Element 0 of %vec1 feeds both byte 0 (unshifted) and byte 1 (shl 8). | 
 |   %v1e0 = extractelement <8 x i8> %vec1, i64 0 | 
 |   %zv1e0 = zext i8 %v1e0 to i32 | 
 |   %byte1 = shl i32 %zv1e0, 8 | 
 |  | 
 |   ; Element 3 of %vec1 becomes byte 2, element 4 of %vec2 becomes byte 3. | 
 |   %v1e3 = extractelement <8 x i8> %vec1, i64 3 | 
 |   %zv1e3 = zext i8 %v1e3 to i32 | 
 |   %byte2 = shl i32 %zv1e3, 16 | 
 |   %v2e4 = extractelement <8 x i8> %vec2, i64 4 | 
 |   %zv2e4 = zext i8 %v2e4 to i32 | 
 |   %byte3 = shl i32 %zv2e4, 24 | 
 |  | 
 |   ; Combine the four bytes and store the i32 result. | 
 |   %tmp0 = or i32 %zv1e0, %byte1 | 
 |   %tmp1 = or i32 %tmp0, %byte2 | 
 |   %res = or i32 %tmp1, %byte3 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Builds an i32 from constant-index bytes of two <8 x i8> vectors: elements 5 | 
 | ; and 7 of the %in0 vector and element 6 of the %in1 vector. The assertions | 
 | ; expect one dword load per source, both at byte offset 4, followed by a | 
 | ; single v_perm_b32 with selector 0x2070505; on gfx908 the selector is first | 
 | ; moved into s4. | 
 | define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: extract_hihi: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v6, v[2:3], off offset:4 | 
 | ; GFX10-NEXT:    global_load_dword v7, v[0:1], off offset:4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x2070505 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract_hihi: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v6, v[2:3], off offset:4 | 
 | ; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:4 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x2070505 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   ; Element 5 of %vec1 feeds both byte 0 (unshifted) and byte 1 (shl 8). | 
 |   %v1e5 = extractelement <8 x i8> %vec1, i64 5 | 
 |   %zv1e5 = zext i8 %v1e5 to i32 | 
 |   %byte1 = shl i32 %zv1e5, 8 | 
 |  | 
 |   ; Element 7 of %vec1 becomes byte 2, element 6 of %vec2 becomes byte 3. | 
 |   %v1e7 = extractelement <8 x i8> %vec1, i64 7 | 
 |   %zv1e7 = zext i8 %v1e7 to i32 | 
 |   %byte2 = shl i32 %zv1e7, 16 | 
 |   %v2e6 = extractelement <8 x i8> %vec2, i64 6 | 
 |   %zv2e6 = zext i8 %v2e6 to i32 | 
 |   %byte3 = shl i32 %zv2e6, 24 | 
 |  | 
 |   ; Combine the four bytes and store the i32 result. | 
 |   %tmp0 = or i32 %zv1e5, %byte1 | 
 |   %tmp1 = or i32 %tmp0, %byte2 | 
 |   %res = or i32 %tmp1, %byte3 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Builds an i32 from constant-index bytes (elements 4, 7 and 1) of a single | 
 | ; <8 x i8> vector. The assertions expect one dwordx2 load followed by a | 
 | ; single v_perm_b32 over its two halves with selector 0x1070404; on gfx908 the | 
 | ; selector is first moved into s4. | 
 | define hidden void @extract_v8i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: extract_v8i8: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x1070404 | 
 | ; GFX10-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract_v8i8: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x1070404 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   ; Element 4 feeds both byte 0 (unshifted) and byte 1 (shl 8). | 
 |   %v1e4 = extractelement <8 x i8> %vec1, i64 4 | 
 |   %zv1e4 = zext i8 %v1e4 to i32 | 
 |   %byte1 = shl i32 %zv1e4, 8 | 
 |  | 
 |   ; Element 7 becomes byte 2 and element 1 becomes byte 3; both come from the | 
 |   ; same (only) vector, hence the v1 prefix on every value name. | 
 |   %v1e7 = extractelement <8 x i8> %vec1, i64 7 | 
 |   %zv1e7 = zext i8 %v1e7 to i32 | 
 |   %byte2 = shl i32 %zv1e7, 16 | 
 |   %v1e1 = extractelement <8 x i8> %vec1, i64 1 | 
 |   %zv1e1 = zext i8 %v1e1 to i32 | 
 |   %byte3 = shl i32 %zv1e1, 24 | 
 |  | 
 |   ; Combine the four bytes and store the i32 result. | 
 |   %tmp0 = or i32 %zv1e4, %byte1 | 
 |   %tmp1 = or i32 %tmp0, %byte2 | 
 |   %res = or i32 %tmp1, %byte3 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Builds an i32 from constant-index bytes (elements 255, 253 and 254) of a | 
 | ; single <256 x i8> vector. The assertions expect only one dword to be loaded | 
 | ; (at byte offset 252) followed by a single v_perm_b32 with selector | 
 | ; 0x6050707; on gfx908 the selector is first moved into s4. | 
 | define hidden void @extract_v256i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: extract_v256i8: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:252 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x6050707 | 
 | ; GFX10-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract_v256i8: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:252 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x6050707 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[2:3], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <256 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   ; Element 255 feeds both byte 0 (unshifted) and byte 1 (shl 8). | 
 |   %v1e255 = extractelement <256 x i8> %vec1, i64 255 | 
 |   %zv1e255 = zext i8 %v1e255 to i32 | 
 |   %byte1 = shl i32 %zv1e255, 8 | 
 |  | 
 |   ; Element 253 becomes byte 2 and element 254 becomes byte 3. | 
 |   %v1e253 = extractelement <256 x i8> %vec1, i64 253 | 
 |   %zv1e253 = zext i8 %v1e253 to i32 | 
 |   %byte2 = shl i32 %zv1e253, 16 | 
 |   %v1e254 = extractelement <256 x i8> %vec1, i64 254 | 
 |   %zv1e254 = zext i8 %v1e254 to i32 | 
 |   %byte3 = shl i32 %zv1e254, 24 | 
 |  | 
 |   ; Combine the four bytes and store the i32 result. | 
 |   %tmp0 = or i32 %zv1e255, %byte1 | 
 |   %tmp1 = or i32 %tmp0, %byte2 | 
 |   %res = or i32 %tmp1, %byte3 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; TODO : support this pattern | 
 | ; Builds an i32 from elements 0 and 5 of the %in0 vector and element 6 of the | 
 | ; %in1 vector. The assertions expect three dwords to be loaded (a dwordx2 plus | 
 | ; a dword at byte offset 4) and the bytes to be combined with shifts, masks | 
 | ; and ors; no v_perm_b32 is emitted yet. | 
 | define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: extract_3src: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dword v8, v[2:3], off offset:4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v7 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v8 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v6 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v0, 0xff0000, v0 | 
 | ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff000000, v1 | 
 | ; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 8, v2 | 
 | ; GFX10-NEXT:    v_or3_b32 v0, v2, v0, v1 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract_3src: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dword v8, v[2:3], off offset:4 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v6 | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v7 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v8 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v1, 0xff0000, v1 | 
 | ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff000000, v2 | 
 | ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 8, v0 | 
 | ; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4 | 
 |   %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4 | 
 |   ; Element 0 of %vec1 feeds both byte 0 (unshifted) and byte 1 (shl 8). | 
 |   %v1e0 = extractelement <8 x i8> %vec1, i64 0 | 
 |   %zv1e0 = zext i8 %v1e0 to i32 | 
 |   %byte1 = shl i32 %zv1e0, 8 | 
 |  | 
 |   ; Element 5 of %vec1 becomes byte 2, element 6 of %vec2 becomes byte 3. | 
 |   %v1e5 = extractelement <8 x i8> %vec1, i64 5 | 
 |   %zv1e5 = zext i8 %v1e5 to i32 | 
 |   %byte2 = shl i32 %zv1e5, 16 | 
 |   %v2e6 = extractelement <8 x i8> %vec2, i64 6 | 
 |   %zv2e6 = zext i8 %v2e6 to i32 | 
 |   %byte3 = shl i32 %zv2e6, 24 | 
 |  | 
 |   ; Combine the four bytes and store the i32 result. | 
 |   %tmp0 = or i32 %zv1e0, %byte1 | 
 |   %tmp1 = or i32 %tmp0, %byte2 | 
 |   %res = or i32 %tmp1, %byte3 | 
 |   store i32 %res, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Should not result in crash | 
 | ; Loads a <6 x i16> vector with 2-byte alignment, packs elements 0/1 into one | 
 | ; i32 and elements 2/3 into another, and stores them to %out0 and %out1. | 
 | ; %in1 is unused. The assertions expect four ushort loads combined with | 
 | ; v_lshl_or_b32 rather than a v_perm_b32. | 
 | define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: extract_v6i16: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    s_clause 0x3 | 
 | ; GFX10-NEXT:    global_load_ushort v2, v[0:1], off offset:6 | 
 | ; GFX10-NEXT:    global_load_ushort v3, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_ushort v8, v[0:1], off offset:2 | 
 | ; GFX10-NEXT:    global_load_ushort v9, v[0:1], off offset:4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_lshl_or_b32 v0, v8, 16, v3 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v9 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[6:7], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract_v6i16: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_ushort v2, v[0:1], off offset:4 | 
 | ; GFX9-NEXT:    global_load_ushort v3, v[0:1], off offset:6 | 
 | ; GFX9-NEXT:    global_load_ushort v8, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_ushort v9, v[0:1], off offset:2 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(2) | 
 | ; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v2 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_lshl_or_b32 v1, v9, 16, v8 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v1, off | 
 | ; GFX9-NEXT:    global_store_dword v[6:7], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec = load <6 x i16>, ptr addrspace(1) %in0, align 2 | 
 |   ; Only the first four of the six elements are read. | 
 |   %el0 = extractelement <6 x i16> %vec, i32 0 | 
 |   %el1 = extractelement <6 x i16> %vec, i32 1 | 
 |   %el2 = extractelement <6 x i16> %vec, i32 2 | 
 |   %el3 = extractelement <6 x i16> %vec, i32 3 | 
 |   ; %o0 = (el1 << 16) | el0 | 
 |   %z0 = zext i16 %el0 to i32 | 
 |   %z1 = zext i16 %el1 to i32 | 
 |   %s1 = shl nuw i32 %z1, 16 | 
 |   %o0 = or i32 %s1, %z0 | 
 |   ; %o1 = el2 | (el3 << 16) | 
 |   %z2 = zext i16 %el2 to i32 | 
 |   %z3 = zext i16 %el3 to i32 | 
 |   %s3 = shl nuw i32 %z3, 16 | 
 |   %o1 = or i32 %z2, %s3 | 
 |  | 
 |   store i32 %o0, ptr addrspace(1) %out0, align 4 | 
 |   store i32 %o1, ptr addrspace(1) %out1, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; Should not result in a crash. | 
 | ; Same packing as the <6 x i16> case but on a <7 x i16> load from %in0 | 
 | ; (align 2): (el1 << 16) | el0 is stored to %out0 and (el3 << 16) | el2 is | 
 | ; stored to %out1. Vector elements 4..6 and the %in1 argument are not used. | 
 | define hidden void @extract_v7i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: extract_v7i16: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[6:7], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract_v7i16: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[6:7], v1, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec = load <7 x i16>, ptr addrspace(1) %in0, align 2 | 
 |   %el0 = extractelement <7 x i16> %vec, i32 0 | 
 |   %el1 = extractelement <7 x i16> %vec, i32 1 | 
 |   %el2 = extractelement <7 x i16> %vec, i32 2 | 
 |   %el3 = extractelement <7 x i16> %vec, i32 3 | 
 | ; First result: element 1 in bits 31:16, element 0 in bits 15:0. | 
 |   %z0 = zext i16 %el0 to i32 | 
 |   %z1 = zext i16 %el1 to i32 | 
 |   %s1 = shl nuw i32 %z1, 16 | 
 |   %o0 = or i32 %s1, %z0 | 
 | ; Second result: element 3 in bits 31:16, element 2 in bits 15:0. | 
 |   %z2 = zext i16 %el2 to i32 | 
 |   %z3 = zext i16 %el3 to i32 | 
 |   %s3 = shl nuw i32 %z3, 16 | 
 |   %o1 = or i32 %z2, %s3 | 
 |  | 
 |   store i32 %o0, ptr addrspace(1) %out0, align 4 | 
 |   store i32 %o1, ptr addrspace(1) %out1, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Should not result in a crash. | 
 | ; Loads a <13 x i8> from %in0 (align 2) and builds two i32 values from single | 
 | ; bytes: (el1 << 16) | el0 is stored to %out0 and (el8 << 16) | el7 is stored | 
 | ; to %out1. Each byte is zero-extended, so bits 15:8 and 31:24 of both results | 
 | ; are zero. The %in1 argument is not used. | 
 | define hidden void @extract_v13i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: extract_v13i8: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    s_clause 0x1 | 
 | ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off | 
 | ; GFX10-NEXT:    global_load_ushort v8, v[0:1], off offset:8 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_bfe_u32 v0, v2, 8, 8 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v8 | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040c00 | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x5040c03 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[6:7], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract_v13i8: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off | 
 | ; GFX9-NEXT:    global_load_ushort v8, v[0:1], off offset:8 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x5040c00 | 
 | ; GFX9-NEXT:    s_mov_b32 s5, 0x5040c03 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_bfe_u32 v0, v2, 8, 8 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v8 | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4 | 
 | ; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s5 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[6:7], v1, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec = load <13 x i8>, ptr addrspace(1) %in0, align 2 | 
 | ; Note: %el2 and %el3 hold vector elements 7 and 8, not elements 2 and 3. | 
 |   %el0 = extractelement <13 x i8> %vec, i32 0 | 
 |   %el1 = extractelement <13 x i8> %vec, i32 1 | 
 |   %el2 = extractelement <13 x i8> %vec, i32 7 | 
 |   %el3 = extractelement <13 x i8> %vec, i32 8 | 
 | ; First result: element 1 in bits 23:16, element 0 in bits 7:0. | 
 |   %z0 = zext i8 %el0 to i32 | 
 |   %z1 = zext i8 %el1 to i32 | 
 |   %s1 = shl nuw i32 %z1, 16 | 
 |   %o0 = or i32 %s1, %z0 | 
 | ; Second result: element 8 in bits 23:16, element 7 in bits 7:0. | 
 |   %z2 = zext i8 %el2 to i32 | 
 |   %z3 = zext i8 %el3 to i32 | 
 |   %s3 = shl nuw i32 %z3, 16 | 
 |   %o1 = or i32 %z2, %s3 | 
 |  | 
 |   store i32 %o0, ptr addrspace(1) %out0, align 4 | 
 |   store i32 %o1, ptr addrspace(1) %out1, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Should not result in a crash. | 
 | ; Loads a <13 x i64> from %in0 (align 2) and builds two i32 values from pieces | 
 | ; of elements 0, 1, 7 and 8: | 
 | ;   %out0 <- (trunc32(el1) << 16) | bits 47:32 of el0 | 
 | ;   %out1 <- (trunc32(el8) << 16) | bits 15:0 of el7 | 
 | ; The %in1 argument is not used. | 
 | define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { | 
 | ; GFX10-LABEL: extract_v13i64: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    s_clause 0x2 | 
 | ; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:48 | 
 | ; GFX10-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off | 
 | ; GFX10-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:64 | 
 | ; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX10-NEXT:    v_perm_b32 v0, v12, v13, 0x1000504 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    v_perm_b32 v1, v10, v14, 0x1000504 | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX10-NEXT:    global_store_dword v[6:7], v1, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: extract_v13i64: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:48 | 
 | ; GFX9-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off | 
 | ; GFX9-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:64 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x1000504 | 
 | ; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(1) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v12, v13, s4 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v1, v10, v14, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    global_store_dword v[6:7], v1, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec = load <13 x i64>, ptr addrspace(1) %in0, align 2 | 
 | ; Note: %el2 and %el3 hold vector elements 7 and 8, not elements 2 and 3. | 
 |   %el0 = extractelement <13 x i64> %vec, i32 0 | 
 |   %el1 = extractelement <13 x i64> %vec, i32 1 | 
 |   %el2 = extractelement <13 x i64> %vec, i32 7 | 
 |   %el3 = extractelement <13 x i64> %vec, i32 8 | 
 | ; First result: the low half comes from bits 47:32 of element 0 (shift right by | 
 | ; 32, then keep 16 bits); the low 32 bits of element 1 are shifted left by 16. | 
 |   %el00 = lshr i64 %el0, 32 | 
 |   %t0 = trunc i64 %el00 to i16 | 
 |   %z0 = zext i16 %t0 to i32 | 
 |   %z1 = trunc i64 %el1 to i32 | 
 |   %s1 = shl nuw i32 %z1, 16 | 
 |   %o0 = or i32 %s1, %z0 | 
 | ; Second result: the low half is bits 15:0 of element 7; the low 32 bits of | 
 | ; element 8 are shifted left by 16. | 
 |   %t2 = trunc i64 %el2 to i16 | 
 |   %z2 = zext i16 %t2 to i32 | 
 |   %z3 = trunc i64 %el3 to i32 | 
 |   %s3 = shl nuw i32 %z3, 16 | 
 |   %o1 = or i32 %z2, %s3 | 
 |  | 
 |   store i32 %o0, ptr addrspace(1) %out0, align 4 | 
 |   store i32 %o1, ptr addrspace(1) %out1, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 |  | 
 | ; Should combine the lower 16 bits of each i32 element of the loaded vector. | 
 | ; Loads a <2 x i32> from %in0 (align 2), truncates it to <2 x i16>, and stores | 
 | ; (el1 << 16) | el0 of the truncated vector to %out0 as a single i32. The %in1 | 
 | ; argument is not used. | 
 | define hidden void @trunc_vector(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { | 
 | ; GFX10-LABEL: trunc_vector: | 
 | ; GFX10:       ; %bb.0: | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX10-NEXT:    s_clause 0x1 | 
 | ; GFX10-NEXT:    global_load_ushort v2, v[0:1], off | 
 | ; GFX10-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:4 | 
 | ; GFX10-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX10-NEXT:    global_store_dword v[4:5], v2, off | 
 | ; GFX10-NEXT:    s_setpc_b64 s[30:31] | 
 | ; | 
 | ; GFX9-LABEL: trunc_vector: | 
 | ; GFX9:       ; %bb.0: | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
 | ; GFX9-NEXT:    global_load_ushort v2, v[0:1], off | 
 | ; GFX9-NEXT:    global_load_ushort v3, v[0:1], off offset:4 | 
 | ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100 | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    v_perm_b32 v0, v3, v2, s4 | 
 | ; GFX9-NEXT:    global_store_dword v[4:5], v0, off | 
 | ; GFX9-NEXT:    s_waitcnt vmcnt(0) | 
 | ; GFX9-NEXT:    s_setpc_b64 s[30:31] | 
 |   %vec = load <2 x i32>, ptr addrspace(1) %in0, align 2 | 
 | ; Keep only the low 16 bits of each 32-bit element. | 
 |   %tvec = trunc <2 x i32> %vec to <2 x i16> | 
 |   %el0 = extractelement <2 x i16> %tvec, i32 0 | 
 |   %el1 = extractelement <2 x i16> %tvec, i32 1 | 
 | ; Repack: truncated element 1 in bits 31:16, truncated element 0 in bits 15:0. | 
 |   %z0 = zext i16 %el0 to i32 | 
 |   %z1 = zext i16 %el1 to i32 | 
 |   %s1 = shl nuw i32 %z1, 16 | 
 |   %o0 = or i32 %s1, %z0 | 
 |  | 
 |   store i32 %o0, ptr addrspace(1) %out0, align 4 | 
 |   ret void | 
 | } | 