; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx908  -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9

; Two-input <4 x i8> shuffle whose mask <6,7,6,6> names only indices 4-7,
; i.e. elements of %vec1. The expected code therefore issues a single dword
; load (from v[2:3]) and passes that one register as both v_perm_b32 sources.
; On gfx1010 the selector is an inline literal; on gfx908 it is first
; materialized in s4.
define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle6766:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x6060706
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle6766:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x6060706
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; Result lanes are %vec1[2], %vec1[3], %vec1[2], %vec1[2]; %vec0 is dead.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 6, i32 6>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}

; Two-input <4 x i8> shuffle, mask <3,7,4,4>, that reads bytes from both
; vectors. Both dwords are loaded and combined by one v_perm_b32 with
; selector 0x307 (small enough for s_movk_i32 on gfx908).
define hidden void @shuffle3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle3744:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v6, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x307
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle3744:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_movk_i32 s4, 0x307
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; Result lanes are %vec0[3], %vec1[3], %vec1[0], %vec1[0].
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 7, i32 4, i32 4>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}

; Two-input <4 x i8> shuffle, mask <4,4,4,5>, that only reads %vec1. As in the
; other single-source cases, only the second pointer (v[2:3]) is loaded and
; v_perm_b32 receives the same register for both sources.
define hidden void @shuffle4445(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle4445:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040404
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle4445:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x5040404
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; Result lanes are %vec1[0], %vec1[0], %vec1[0], %vec1[1]; %vec0 is dead.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 4, i32 4, i32 5>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}

; Two-input <4 x i8> shuffle, mask <0,1,0,1>, that only reads %vec0 (the low
; half-word duplicated). Only the first pointer (v[0:1]) is loaded and the
; same register feeds both v_perm_b32 sources.
define hidden void @shuffle0101(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle0101:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle0101:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x5040504
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; Result lanes are %vec0[0], %vec0[1], %vec0[0], %vec0[1]; %vec1 is dead.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}

; Two-input <4 x i8> shuffle, mask <1,0,0,4>: three lanes from %vec0 and the
; last lane from %vec1. Both dwords are loaded and merged by one v_perm_b32.
define hidden void @shuffle1004(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle1004:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v6, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x40405
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle1004:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x40405
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; Result lanes are %vec0[1], %vec0[0], %vec0[0], %vec1[0].
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 0, i32 0, i32 4>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}



; Same kind of two-input <4 x i8> shuffle (mask <7,5,3,3>) but through
; addrspace(0) pointers, so the expected memory operations are
; flat_load_dword / flat_store_dword with the matching lgkmcnt waits.
; Note the v_perm_b32 sources appear as v7, v6 (second load first) here.
define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
; GFX10-LABEL: shuffle7533:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    flat_load_dword v6, v[0:1]
; GFX10-NEXT:    flat_load_dword v7, v[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3030507
; GFX10-NEXT:    flat_store_dword v[4:5], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle7533:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_dword v6, v[0:1]
; GFX9-NEXT:    flat_load_dword v7, v[2:3]
; GFX9-NEXT:    s_mov_b32 s4, 0x3030507
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT:    flat_store_dword v[4:5], v0
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
  ; Result lanes are %vec1[3], %vec1[1], %vec0[3], %vec0[3].
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
  store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4
  ret void
}

; addrspace(0) (flat) variant of the single-source case: mask <7,7,6,7> reads
; only %vec1, so one flat_load_dword from v[2:3] is expected and the same
; register is used for both v_perm_b32 sources.
define hidden void @shuffle7767(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
; GFX10-LABEL: shuffle7767:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    flat_load_dword v0, v[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060707
; GFX10-NEXT:    flat_store_dword v[4:5], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle7767:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_dword v0, v[2:3]
; GFX9-NEXT:    s_mov_b32 s4, 0x7060707
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    flat_store_dword v[4:5], v0
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
  ; Result lanes are %vec1[3], %vec1[3], %vec1[2], %vec1[3]; %vec0 is dead.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7>
  store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4
  ret void
}

; Two-input <4 x i8> shuffle (mask <0,5,5,4>) through addrspace(3) pointers.
; The pointers are 32-bit here (single VGPRs v0, v1, v2) and the expected
; memory operations are ds_read_b32 / ds_write_b32.
define hidden void @shuffle0554(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) {
; GFX10-LABEL: shuffle0554:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_b32 v0, v0
; GFX10-NEXT:    ds_read_b32 v1, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x10104
; GFX10-NEXT:    ds_write_b32 v2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle0554:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b32 v0, v0
; GFX9-NEXT:    ds_read_b32 v1, v1
; GFX9-NEXT:    s_mov_b32 s4, 0x10104
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT:    ds_write_b32 v2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4
  ; Result lanes are %vec0[0], %vec1[1], %vec1[1], %vec1[0].
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 4>
  store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4
  ret void
}

; Second addrspace(3) case (ds_read_b32 / ds_write_b32): mask <2,1,2,7> takes
; three lanes from %vec0 and the top lane from %vec1, merged by one
; v_perm_b32.
define hidden void @shuffle2127(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) {
; GFX10-LABEL: shuffle2127:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_b32 v0, v0
; GFX10-NEXT:    ds_read_b32 v1, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3060506
; GFX10-NEXT:    ds_write_b32 v2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle2127:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b32 v0, v0
; GFX9-NEXT:    ds_read_b32 v1, v1
; GFX9-NEXT:    s_mov_b32 s4, 0x3060506
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT:    ds_write_b32 v2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4
  ; Result lanes are %vec0[2], %vec0[1], %vec0[2], %vec1[3].
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 1, i32 2, i32 7>
  store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4
  ret void
}

; Two-input <4 x i8> shuffle (mask <5,0,4,7>) through addrspace(5) pointers.
; The expected memory operations are buffer_load_dword / buffer_store_dword
; with "offen" addressing off s[0:3]; gfx1010 additionally groups the two
; loads under s_clause.
define hidden void @shuffle5047(ptr addrspace(5) %in0, ptr addrspace(5) %in1, ptr addrspace(5) %out0) {
; GFX10-LABEL: shuffle5047:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
; GFX10-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v4, v3, 0x7040005
; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle5047:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
; GFX9-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_mov_b32 s4, 0x7040005
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v4, v3, s4
; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(5) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(5) %in1, align 4
  ; Result lanes are %vec1[1], %vec0[0], %vec1[0], %vec1[3].
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 0, i32 4, i32 7>
  store <4 x i8> %shuffle0_0, ptr addrspace(5) %out0, align 4
  ret void
}

; Two-input <4 x i8> shuffle on addrspace(1) pointers, mask <3,5,4,6>: one
; lane from %vec0 and three distinct lanes from %vec1, merged by a single
; v_perm_b32.
define hidden void @shuffle3546(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle3546:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v6, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x2000107
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle3546:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x2000107
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; Result lanes are %vec0[3], %vec1[1], %vec1[0], %vec1[2].
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 5, i32 4, i32 6>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}


; Single-input shuffle: the second shufflevector operand is undef, so any mask
; index >= 4 yields an unconstrained lane. With mask <7,3,3,0> lane 0 is
; undefined and lanes 1-3 take %vec0[3], %vec0[3], %vec0[0]. The expected
; code is still one load plus one v_perm_b32 with both sources equal.
define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle7330ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x4070706
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle7330ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x4070706
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 3, i32 3, i32 0>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}

; Single-input shuffle with an undef second operand, mask <5,3,4,1>: lanes 0
; and 2 index the undef operand; lanes 1 and 3 take %vec0[3] and %vec0[1].
; Expected code is one load plus one v_perm_b32 with both sources equal.
define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle5341ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040706
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle5341ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x5040706
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 5, i32 3, i32 4, i32 1>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}

; Single-input shuffle with an undef second operand, mask <6,1,0,6>: lanes 0
; and 3 index the undef operand; lanes 1 and 2 take %vec0[1] and %vec0[0].
; Expected code is one load plus one v_perm_b32 with both sources equal.
define hidden void @shuffle6106ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle6106ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle6106ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x5040504
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 6, i32 1, i32 0, i32 6>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}


; Single-input shuffle with an undef second operand, mask <4,3,2,7>: lanes 0
; and 3 index the undef operand; lanes 1 and 2 take %vec0[3] and %vec0[2].
; Expected code is one load plus one v_perm_b32 with both sources equal.
define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle4327ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060706
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle4327ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x7060706
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 4, i32 3, i32 2, i32 7>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}

; Single-input shuffle with an undef second operand, mask <3,2,6,3>: lane 2
; indexes the undef operand; lanes 0, 1 and 3 take %vec0[3], %vec0[2] and
; %vec0[3]. Expected code is one load plus one v_perm_b32 with both sources
; equal.
define hidden void @shuffle3263ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle3263ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060607
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle3263ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x7060607
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 6, i32 3>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}

; Single-input shuffle with an undef second operand, mask <2,7,6,3>: lanes 1
; and 2 index the undef operand; lanes 0 and 3 take %vec0[2] and %vec0[3].
; Expected code is one load plus one v_perm_b32 with both sources equal.
define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle2763ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060706
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle2763ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x7060706
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 2, i32 7, i32 6, i32 3>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}

; Single-input shuffle with an undef second operand, mask <1,3,2,7>: lane 3
; indexes the undef operand; lanes 0-2 take %vec0[1], %vec0[3] and %vec0[2].
; Expected code is one load plus one v_perm_b32 with both sources equal.
define hidden void @shuffle1327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle1327ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060705
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle1327ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x7060705
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 7>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}

; Single-input shuffle with an undef second operand, mask <0,6,0,5>: lanes 1
; and 3 index the undef operand; lanes 0 and 2 both take %vec0[0].
; Expected code is one load plus one v_perm_b32 with both sources equal.
define hidden void @shuffle0605ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle0605ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle0605ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x5040504
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 0, i32 6, i32 0, i32 5>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}

; Shuffle whose result is modified by an insertelement before being stored.
; The expected output contains no v_perm_b32: the stored dword is built from
; a 16-bit shift of %elt (v4), an SDWA or that picks byte 1 of the loaded
; value, and a mask with 0xffff.
; NOTE(review): %vec1 is loaded but never used (the shuffle's second operand
; is undef), and the expected code has no load from %in1.
define hidden void @insertUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
; GFX10-LABEL: insertUsesOr:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: insertUsesOr:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; Lanes 0-1 are %vec0[1], %vec0[3]; lanes 2-3 index the undef operand.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
  ; Lane 1 is then overwritten with %elt, leaving %vec0[1] as the only byte
  ; taken from the shuffle.
  %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ret void
}

; Shuffle (second operand undef) whose result feeds a <4 x i8> add with
; %vec1. The expected output contains no v_perm_b32; the bytes are extracted,
; added with 16-bit adds and re-packed with SDWA ors (gfx908 folds the byte
; selection into v_add_u16_sdwa).
; NOTE(review): the %elt parameter is unused in this function.
define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
; GFX10-LABEL: addUsesOr:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v7
; GFX10-NEXT:    v_add_nc_u16 v2, v2, v3
; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT:    v_add_nc_u16 v1, v4, v1
; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: addUsesOr:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; Lanes 1 and 3 are %vec0[0] and %vec0[3]; lanes 0 and 2 index the undef
  ; operand.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 0, i32 6, i32 3>
  %added = add <4 x i8> %shuffle0_0, %vec1
  store <4 x i8> %added, ptr addrspace(1) %out0
  ret void
}


; <8 x i8> two-input shuffle in an amdgpu_kernel. The kernel arguments and
; both vectors are fetched with scalar loads (s_load_dwordx*), and the
; expected output contains no v_perm_b32: the result bytes are assembled with
; scalar shifts, 16-bit vector shifts, an and with 0xffffff00 and SDWA ors
; before a single global_store_dwordx2.
; Attribute group #0 is defined outside this function.
define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out1) #0 {
; GFX10-LABEL: shuffle8i8:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_lshr_b32 s1, s1, 8
; GFX10-NEXT:    s_lshr_b32 s4, s9, 16
; GFX10-NEXT:    v_lshlrev_b16 v0, 8, s9
; GFX10-NEXT:    v_and_b32_e64 v1, 0xffffff00, s8
; GFX10-NEXT:    v_lshlrev_b16 v2, 8, s4
; GFX10-NEXT:    v_lshlrev_b16 v3, 8, s8
; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
; GFX10-NEXT:    v_or_b32_sdwa v0, s1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX9-LABEL: shuffle8i8:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffffff00
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
; GFX9-NEXT:    v_lshlrev_b16_e64 v1, 8, s9
; GFX9-NEXT:    v_or_b32_sdwa v4, s1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    s_lshr_b32 s1, s9, 16
; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
; GFX9-NEXT:    v_lshlrev_b16_e64 v3, 8, s8
; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
; GFX9-NEXT:    v_lshlrev_b16_e64 v1, 8, s1
; GFX9-NEXT:    v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT:    s_endpgm
bb:
  %vec0 = load <8 x i8>, ptr addrspace(1) %in0
  %vec1 = load <8 x i8>, ptr addrspace(1) %in1
  ; With 8-element inputs, mask indices 0-7 pick from %vec0 and 8-15 from
  ; %vec1; the mask alternates between the two sources lane by lane.
  %shuffle0 = shufflevector <8 x i8> %vec0, <8 x i8> %vec1, <8 x i32> <i32 1, i32 8, i32 5, i32 12, i32 0, i32 14, i32 2, i32 9>
  store <8 x i8> %shuffle0, ptr addrspace(1) %out1
  ret void
}

; Work-item id intrinsics. They are not referenced by any function above this
; point; presumably they are used by the later tests that need per-lane
; values (verify against the rest of the file).
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

; Not combined to perm due to non-vectorized use, non-divergent
; Two-input <4 x i8> shuffle (mask <1,3,5,4>) whose result is added
; element-wise to %vec1 before the store. The expected output contains no
; v_perm_b32: each byte is extracted, added with a 16-bit add and re-packed
; with SDWA ors. The pointers come straight from the function arguments.
; NOTE(review): the %elt parameter is unused, and %vecins is the result of
; an add (the name looks carried over from @insertUsesOr).
define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
; GFX10-LABEL: add:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v7
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v7
; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v4
; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-NEXT:    v_add_nc_u16 v2, v7, v2
; GFX10-NEXT:    v_add_nc_u16 v3, v3, v7
; GFX10-NEXT:    v_add_nc_u16 v1, v1, v4
; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v2
; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v7, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u16_sdwa v1, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT:    v_add_u16_sdwa v3, v7, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX9-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; Shuffle lanes are %vec0[1], %vec0[3], %vec1[1], %vec1[0].
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
  %vecins = add <4 x i8> %shuffle0_0, %vec1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ret void
}

; Not combined to perm due to non-vectorized use
;
; Same shape as @add, but both input pointers are indexed by the work-item id
; and the shuffle's second operand is undef. The add is the only user of the
; shuffle. %elt is unused.
; The expected output for both targets contains no v_perm_b32 and masks the
; stored value with 0xffff.
define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
; GFX10-LABEL: add_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v7
; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v4
; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT:    v_add_nc_u16 v1, v1, v7
; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-work-item addresses: element %tid of each <4 x i8> array.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 5 and 4 refer to the second operand, which is undef here, so
  ; only lanes 0 and 1 (vec0[1], vec0[3]) carry defined data.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
  %vecins = add <4 x i8> %shuffle0_0, %vec1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ret void
}

; Not combined to perm due to non-divergent use
;
; The shuffle has two users: the element-wise add (stored to %out0) and a
; direct vector store to %out1. The inputs are loaded straight from %in0 and
; %in1 without work-item indexing, and the shuffle's second operand is undef.
; %elt is unused.
; The expected output for both targets contains no v_perm_b32.
define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: add_store:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v9, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v9
; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v4
; GFX10-NEXT:    v_add_nc_u16 v1, v0, v1
; GFX10-NEXT:    v_add_nc_u16 v3, v2, v9
; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT:    global_store_dword v[5:6], v1, off
; GFX10-NEXT:    global_store_dword v[7:8], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_store:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v4
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v4
; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u16_sdwa v1, v1, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT:    v_add_u16_e32 v3, v0, v9
; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    global_store_dword v[5:6], v1, off
; GFX9-NEXT:    global_store_dword v[7:8], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; Mask indices 5 and 4 refer to the undef second operand, so only lanes 0
  ; and 1 (vec0[1], vec0[3]) carry defined data.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
  %vecins = add <4 x i8> %shuffle0_0, %vec1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ; Second, whole-vector use of the shuffle result.
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}

; Not combined to perm due to 16 bit or
;
; Like @add_store (shuffle result used by both an add and a direct store), but
; the input pointers are indexed by the work-item id. The shuffle's second
; operand is undef, so only two of its four lanes hold defined bytes.
; %elt is unused.
; The expected output for both targets contains no v_perm_b32 and masks both
; stored values with 0xffff.
define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: add_store_div_16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v9, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v9
; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v4
; GFX10-NEXT:    v_add_nc_u16 v1, v0, v1
; GFX10-NEXT:    v_add_nc_u16 v3, v2, v9
; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT:    global_store_dword v[5:6], v1, off
; GFX10-NEXT:    global_store_dword v[7:8], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_store_div_16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v9, v[0:1], off
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 8, v9
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 24, v9
; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 8, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u16_sdwa v2, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-work-item addresses: element %tid of each <4 x i8> array.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 5 and 4 refer to the undef second operand, so only lanes 0
  ; and 1 (vec0[1], vec0[3]) carry defined data.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
  %vecins = add <4 x i8> %shuffle0_0, %vec1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ; Second, whole-vector use of the shuffle result.
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}

; Vectorized use, divergent, 32 bit or
;
; The inputs are loaded through work-item-indexed pointers and all four shuffle
; lanes come from real data (two from %vec0, two from %vec1). The shuffle
; result is used by an element-wise add (stored to %out0) and is also stored
; whole to %out1. %elt is unused.
; The expected output for both targets contains one v_perm_b32, whose result
; is the second value stored; the add is still expanded per byte.
define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: add_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v9, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v9
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v9
; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v4
; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v9
; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-NEXT:    v_add_nc_u16 v2, v9, v2
; GFX10-NEXT:    v_add_nc_u16 v3, v3, v9
; GFX10-NEXT:    v_add_nc_u16 v1, v1, v10
; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v2
; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x10705
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    global_store_dword v[7:8], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v9, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x10705
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v1, v9, v4, s4
; GFX9-NEXT:    v_add_u16_sdwa v2, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX9-NEXT:    v_add_u16_sdwa v3, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT:    v_add_u16_sdwa v9, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX9-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-work-item addresses: element %tid of each <4 x i8> array.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result is <vec0[1], vec0[3], vec1[1], vec1[0]>.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
  %vecins = add <4 x i8> %shuffle0_0, %vec1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ; Second, whole-vector use of the shuffle result.
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}

; Shuffle of two work-item-indexed <4 x i8> loads whose result is used by a
; bitwise and with a constant vector (stored to %out0) and is also stored
; whole to %out1. %elt is unused.
; The expected output for both targets contains one v_perm_b32, whose result
; is the second value stored.
define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: and_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v9, v[2:3], off
; GFX10-NEXT:    v_mov_b32_e32 v0, 2
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x102
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_and_b32_e32 v2, 0x100, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_sdwa v0, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX10-NEXT:    v_and_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x5070006
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    global_store_dword v[7:8], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: and_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v0, 2
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    s_movk_i32 s5, 0x102
; GFX9-NEXT:    s_mov_b32 s4, 0x5070006
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_e32 v2, 0x100, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v0, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX9-NEXT:    v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_perm_b32 v3, v4, v9, s4
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-work-item addresses: element %tid of each <4 x i8> array.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result is <vec0[2], vec1[0], vec0[3], vec0[1]>.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 4, i32 3, i32 1>
  %vecins = and <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ; Second, whole-vector use of the shuffle result.
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}

; Shuffle of two work-item-indexed <4 x i8> loads whose result is used by an
; arithmetic shift right with per-lane constant amounts (stored to %out0) and
; is also stored whole to %out1. %elt is unused.
; The expected output for both targets contains one v_perm_b32, whose result
; is the second value stored.
define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: ashr_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v9, v[0:1], off
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 24, v9
; GFX10-NEXT:    v_bfe_i32 v2, v9, 0, 8
; GFX10-NEXT:    v_lshlrev_b16 v3, 6, v1
; GFX10-NEXT:    v_lshlrev_b16 v2, 7, v2
; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ashrrev_i16 v4, 10, v0
; GFX10-NEXT:    v_perm_b32 v0, v9, v0, 0x4010707
; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff00, v3
; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff00, v2
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[5:6], v1, off
; GFX10-NEXT:    global_store_dword v[7:8], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: ashr_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x4010707
; GFX9-NEXT:    v_mov_b32_e32 v0, 7
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 24, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4
; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 6, v1
; GFX9-NEXT:    v_ashrrev_i16_e32 v3, 10, v9
; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffffff00, v0
; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff00, v4
; GFX9-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-work-item addresses: element %tid of each <4 x i8> array.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result is <vec0[3], vec0[3], vec1[1], vec0[0]>; vec0[3] is selected twice.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 0>
  %vecins = ashr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ; Second, whole-vector use of the shuffle result.
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}

; Shuffle of two work-item-indexed <4 x i8> loads whose result is bitcast to
; i32 and stored to %out1, and also stored as a vector to %out0.
; %elt is unused.
; The expected output for both targets contains a single v_perm_b32 whose
; result register is the source of both stores.
define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: bc_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v9, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v9, v4, 0x7060104
; GFX10-NEXT:    global_store_dword v[7:8], v0, off
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bc_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x7060104
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
; GFX9-NEXT:    global_store_dword v[7:8], v0, off
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-work-item addresses: element %tid of each <4 x i8> array.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result is <vec1[0], vec0[1], vec1[2], vec1[3]>.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  ; Reinterpret the four shuffled bytes as one 32-bit integer.
  %insvec = bitcast <4 x i8> %shuffle0_0 to i32
  store i32 %insvec, ptr addrspace(1) %out1
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
  ret void
}


; Shuffle of two work-item-indexed <4 x i8> loads whose result has a scalar
; use (lane 1 is extracted and stored as an i8 to %out2) and a whole-vector
; use (stored to %out1). %elt and %out0 are unused.
; The expected output for both targets contains one v_perm_b32 for the vector
; store, while the byte store takes its value from a 24-bit right shift of the
; first loaded dword.
define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
; GFX10-LABEL: eve_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v5, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v1, v5, v4, 0x1020305
; GFX10-NEXT:    global_store_byte v[9:10], v0, off
; GFX10-NEXT:    global_store_dword v[7:8], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: eve_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v5, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x1020305
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
; GFX9-NEXT:    global_store_byte v[9:10], v1, off
; GFX9-NEXT:    global_store_dword v[7:8], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-work-item addresses: element %tid of each <4 x i8> array.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result is <vec1[1], vec0[3], vec0[2], vec0[1]>.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 3, i32 2, i32 1>
  ; Lane 1 of the shuffle is vec0[3].
  %tmp = extractelement <4 x i8> %shuffle0_0, i32 1
  store i8 %tmp, ptr addrspace(1) %out2
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}

; Not combined to perm due to multi use of or operands (introduced by insert op)
;
; Shuffle of two work-item-indexed <4 x i8> loads. The result is stored whole
; to %out1, and a copy with lane 1 replaced by %elt is stored to %out0.
; The expected output for both targets contains no v_perm_b32; it uses
; v_alignbit_b32 together with or/shift sequences instead.
define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: ive_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v9, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 2, v9
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v9, v[0:1], off
; GFX10-NEXT:    global_load_dword v10, v[2:3], off
; GFX10-NEXT:    v_mov_b32_e32 v0, 16
; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v10, 16
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[5:6], v1, off
; GFX10-NEXT:    global_store_dword v[7:8], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: ive_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v9, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 2, v9
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v9
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v9
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v9, v[0:1], off
; GFX9-NEXT:    global_load_dword v10, v[2:3], off
; GFX9-NEXT:    s_movk_i32 s4, 0xff
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v2, v10, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX9-NEXT:    v_alignbit_b32 v2, v1, v10, 16
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-work-item addresses: element %tid of each <4 x i8> array.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result is <vec1[2], vec1[3], vec0[0], vec0[2]>.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 0, i32 2>
  ; Overwrite lane 1 (vec1[3]) with the scalar argument.
  %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ; The unmodified shuffle result is stored as well.
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}


; lhsr_store_div: two-source <4 x i8> shuffle whose result is both shifted
; right (lshr) and stored unmodified.
;   %in0, %in1 - base pointers; each is indexed by the workitem id x.
;   %elt       - not referenced anywhere in the body.
;   %out0      - receives the lshr result.
;   %out1      - receives the raw shuffle result.
; The checks for both targets expect the raw shuffle to appear as one
; v_perm_b32 with selector 0x1030707, alongside the expanded per-byte shifts.
; NOTE(review): "lhsr" in the name looks like a transposition of "lshr";
; "_div" presumably refers to the divergent (workitem-id based) addresses -
; confirm against the rest of the file before relying on either reading.
define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: lhsr_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v9, v[2:3], off
; GFX10-NEXT:    v_mov_b32_e32 v0, 26
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 25, v9
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 26, v4
; GFX10-NEXT:    v_and_b32_e32 v1, 0x7f00, v1
; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x1030707
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    global_store_dword v[7:8], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: lhsr_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v0, 26
; GFX9-NEXT:    s_mov_b32 s4, 0x1030707
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b16_e32 v3, 1, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 25, v9
; GFX9-NEXT:    v_perm_b32 v1, v9, v4, s4
; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 26, v4
; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f00, v3
; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1:
  ; result = { vec1[3], vec1[3], vec0[3], vec0[1] }.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1>
  ; Per-lane logical shift right by the constant amounts 1, 2, 2, 1.
  %vecins = lshr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}


; mul_store_div: two-source <4 x i8> shuffle whose result is both multiplied
; by the second loaded vector and stored unmodified.
;   %in0, %in1 - base pointers; each is indexed by the workitem id x.
;   %elt       - not referenced anywhere in the body.
;   %out0      - receives the product.
;   %out1      - receives the raw shuffle result.
; The checks for both targets expect the raw shuffle to appear as one
; v_perm_b32 with selector 0x2000504, alongside the expanded byte multiplies.
define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: mul_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v9, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b16 v0, 8, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 24, v9
; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v9
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v2
; GFX10-NEXT:    v_mul_lo_u16 v1, v3, v1
; GFX10-NEXT:    v_mul_lo_u16 v2, v4, v9
; GFX10-NEXT:    v_mul_lo_u16 v3, v9, v3
; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x2000504
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    global_store_dword v[7:8], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mul_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v9, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x2000504
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_lo_u16_e32 v2, v9, v4
; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v4, v0
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_perm_b32 v1, v9, v4, s4
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1:
  ; result = { vec0[0], vec0[1], vec1[0], vec1[2] }.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 4, i32 6>
  ; Element-wise multiply of the shuffle by the second loaded vector.
  %vecins = mul <4 x i8> %shuffle0_0, %vec1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}


; or_store_div: two-source <4 x i8> shuffle whose result is both OR-ed with a
; constant vector and stored unmodified.
;   %in0, %in1 - base pointers; each is indexed by the workitem id x.
;   %elt       - not referenced anywhere in the body.
;   %out0      - receives the OR result.
;   %out1      - receives the raw shuffle result.
; The checks for both targets expect the raw shuffle to appear as one
; v_perm_b32 with selector 0x2010005; the constant operand shows up there as
; the immediates 0x201 and 0x102.
define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: or_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v9, v[0:1], off
; GFX10-NEXT:    v_mov_b32_e32 v0, 16
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x102
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v4
; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_e32 v1, 0x201, v1
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x2010005
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    global_store_dword v[7:8], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: or_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v2, v[2:3], off
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x2010005
; GFX9-NEXT:    s_movk_i32 s5, 0x102
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 8, v2
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v4, v0, v2, s4
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, 0x201, v0
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v4, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1:
  ; result = { vec0[1], vec1[0], vec1[1], vec1[2] }.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 4, i32 5, i32 6>
  ; Per-lane bitwise OR with the constants 1, 2, 2, 1.
  %vecins = or <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}

; sdiv_store_div: two-source <4 x i8> shuffle whose result is both divided
; (signed) by the second loaded vector and stored unmodified.
;   %in0, %in1 - base pointers; each is indexed by the workitem id x.
;   %elt       - not referenced anywhere in the body.
;   %out0      - receives the quotient.
;   %out1      - receives the raw shuffle result.
; The checks for both targets expect the raw shuffle to appear as one
; v_perm_b32 with selector 0x60706. The i8 division itself is expected as a
; per-lane float sequence (v_cvt_f32_i32, v_rcp_iflag_f32, v_mul_f32,
; v_trunc_f32, v_mad_f32, compare/select fix-up, v_cvt_i32_f32).
define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: sdiv_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v9, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_bfe_i32 v0, v4, 0, 8
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 24, v9
; GFX10-NEXT:    v_bfe_i32 v3, v4, 8, 8
; GFX10-NEXT:    v_bfe_i32 v1, v9, 16, 8
; GFX10-NEXT:    v_bfe_i32 v10, v4, 16, 8
; GFX10-NEXT:    v_cvt_f32_i32_e32 v13, v0
; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 24, v4
; GFX10-NEXT:    v_xor_b32_e32 v15, v2, v3
; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, v3
; GFX10-NEXT:    v_xor_b32_e32 v12, v1, v0
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v16, v13
; GFX10-NEXT:    v_cvt_f32_i32_e32 v14, v1
; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v10
; GFX10-NEXT:    v_cvt_f32_i32_e32 v10, v10
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v17, v3
; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v11
; GFX10-NEXT:    v_cvt_f32_i32_e32 v11, v11
; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v10
; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 30, v12
; GFX10-NEXT:    v_mul_f32_e32 v16, v14, v16
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v19, v11
; GFX10-NEXT:    v_ashrrev_i32_e32 v15, 30, v15
; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
; GFX10-NEXT:    v_mul_f32_e32 v17, v2, v17
; GFX10-NEXT:    v_trunc_f32_e32 v16, v16
; GFX10-NEXT:    v_or_b32_e32 v12, 1, v12
; GFX10-NEXT:    v_or_b32_e32 v15, 1, v15
; GFX10-NEXT:    v_mul_f32_e32 v18, v14, v18
; GFX10-NEXT:    v_trunc_f32_e32 v17, v17
; GFX10-NEXT:    v_mad_f32 v20, -v16, v13, v14
; GFX10-NEXT:    v_mul_f32_e32 v19, v13, v19
; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
; GFX10-NEXT:    v_trunc_f32_e32 v18, v18
; GFX10-NEXT:    v_mad_f32 v2, -v17, v3, v2
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v20|, |v13|
; GFX10-NEXT:    v_trunc_f32_e32 v19, v19
; GFX10-NEXT:    v_or_b32_e32 v1, 1, v1
; GFX10-NEXT:    v_mad_f32 v14, -v18, v10, v14
; GFX10-NEXT:    v_or_b32_e32 v0, 1, v0
; GFX10-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, |v3|
; GFX10-NEXT:    v_mad_f32 v21, -v19, v11, v13
; GFX10-NEXT:    v_cvt_i32_f32_e32 v16, v16
; GFX10-NEXT:    v_cvt_i32_f32_e32 v17, v17
; GFX10-NEXT:    v_cvt_i32_f32_e32 v18, v18
; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v15, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v14|, |v10|
; GFX10-NEXT:    v_cvt_i32_f32_e32 v19, v19
; GFX10-NEXT:    v_add_nc_u32_e32 v3, v16, v12
; GFX10-NEXT:    v_add_nc_u32_sdwa v2, v17, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v21|, |v11|
; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_add_nc_u32_e32 v1, v18, v1
; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc_lo
; GFX10-NEXT:    v_add_nc_u32_sdwa v0, v19, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x60706
; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    global_store_dword v[7:8], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: sdiv_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v9, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x60706
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_i32 v1, v4, 0, 8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
; GFX9-NEXT:    v_bfe_i32 v2, v9, 16, 8
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 24, v9
; GFX9-NEXT:    v_bfe_i32 v9, v4, 8, 8
; GFX9-NEXT:    v_cvt_f32_i32_e32 v12, v1
; GFX9-NEXT:    v_bfe_i32 v10, v4, 16, 8
; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 24, v4
; GFX9-NEXT:    v_xor_b32_e32 v14, v3, v9
; GFX9-NEXT:    v_cvt_f32_i32_e32 v9, v9
; GFX9-NEXT:    v_xor_b32_e32 v11, v2, v1
; GFX9-NEXT:    v_cvt_f32_i32_e32 v13, v2
; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v10
; GFX9-NEXT:    v_cvt_f32_i32_e32 v10, v10
; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v4
; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v15, v12
; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v16, v9
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v17, v10
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v18, v4
; GFX9-NEXT:    v_mul_f32_e32 v15, v13, v15
; GFX9-NEXT:    v_mul_f32_e32 v16, v3, v16
; GFX9-NEXT:    v_trunc_f32_e32 v15, v15
; GFX9-NEXT:    v_ashrrev_i32_e32 v11, 30, v11
; GFX9-NEXT:    v_mul_f32_e32 v17, v13, v17
; GFX9-NEXT:    v_mul_f32_e32 v18, v12, v18
; GFX9-NEXT:    v_trunc_f32_e32 v16, v16
; GFX9-NEXT:    v_mad_f32 v19, -v15, v12, v13
; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 30, v14
; GFX9-NEXT:    v_or_b32_e32 v11, 1, v11
; GFX9-NEXT:    v_trunc_f32_e32 v17, v17
; GFX9-NEXT:    v_trunc_f32_e32 v18, v18
; GFX9-NEXT:    v_mad_f32 v3, -v16, v9, v3
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, |v12|
; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
; GFX9-NEXT:    v_or_b32_e32 v14, 1, v14
; GFX9-NEXT:    v_cvt_i32_f32_e32 v15, v15
; GFX9-NEXT:    v_cvt_i32_f32_e32 v16, v16
; GFX9-NEXT:    v_mad_f32 v13, -v17, v10, v13
; GFX9-NEXT:    v_cvt_i32_f32_e32 v17, v17
; GFX9-NEXT:    v_mad_f32 v20, -v18, v4, v12
; GFX9-NEXT:    v_cvt_i32_f32_e32 v18, v18
; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v11, vcc
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v9|
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
; GFX9-NEXT:    v_or_b32_e32 v2, 1, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v14, vcc
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v13|, |v10|
; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v20|, |v4|
; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT:    v_add_u32_e32 v4, v15, v11
; GFX9-NEXT:    v_add_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_add_u32_e32 v2, v17, v2
; GFX9-NEXT:    v_add_u32_sdwa v1, v18, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v1, off
; GFX9-NEXT:    global_store_dword v[7:8], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1:
  ; result = { vec0[2], vec0[3], vec0[2], vec1[0] }.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 3, i32 2, i32 4>
  ; Element-wise signed division of the shuffle by the second loaded vector.
  %vecins = sdiv <4 x i8> %shuffle0_0, %vec1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}


; sext_store_div: two-source <4 x i8> shuffle whose result is both
; sign-extended to <4 x i16> and stored unmodified.
;   %in0, %in1 - base pointers; each is indexed by the workitem id x.
;   %elt       - not referenced anywhere in the body.
;   %out1      - receives the widened <4 x i16> value (a 64-bit store).
;   %out0      - receives the raw <4 x i8> shuffle result.
; Note that the %out0/%out1 roles are the reverse of the same-width tests in
; this group (lshr/mul/or/sdiv/shl), where %out0 takes the operated value.
; The checks for both targets expect the raw shuffle to appear as one
; v_perm_b32 with selector 0x3010707; the additional v_perm_b32 instructions
; with selector 0x5040100 belong to the widened value.
define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: sext_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v9, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
; GFX10-NEXT:    v_ashrrev_i16 v2, 8, v4
; GFX10-NEXT:    v_ashrrev_i16 v0, 8, v0
; GFX10-NEXT:    v_ashrrev_i16 v3, 8, v1
; GFX10-NEXT:    v_perm_b32 v1, v0, v2, 0x5040100
; GFX10-NEXT:    v_perm_b32 v0, v3, v3, 0x5040100
; GFX10-NEXT:    v_perm_b32 v2, v9, v4, 0x3010707
; GFX10-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
; GFX10-NEXT:    global_store_dword v[5:6], v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: sext_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v0, 8
; GFX9-NEXT:    s_mov_b32 s5, 0x5040100
; GFX9-NEXT:    s_mov_b32 s4, 0x3010707
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 8, v9
; GFX9-NEXT:    v_ashrrev_i16_sdwa v3, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_ashrrev_i16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s5
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s5
; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4
; GFX9-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
; GFX9-NEXT:    global_store_dword v[5:6], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1:
  ; result = { vec0[3], vec0[3], vec1[1], vec1[3] }.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 7>
  ; Widen every lane from i8 to i16 with sign extension.
  %insvec = sext <4 x i8> %shuffle0_0 to <4 x i16>
  store <4 x i16> %insvec, ptr addrspace(1) %out1
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
  ret void
}


; shl_store_div: two-source <4 x i8> shuffle whose result is both shifted
; left (shl) and stored unmodified.
;   %in0, %in1 - base pointers; each is indexed by the workitem id x.
;   %elt       - not referenced anywhere in the body.
;   %out0      - receives the shl result.
;   %out1      - receives the raw shuffle result.
; The checks for both targets expect the raw shuffle to appear as one
; v_perm_b32 with selector 0x5000104, alongside the 16-bit shifts and masks
; that implement the per-byte shift.
define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: shl_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v9, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshlrev_b16 v0, 2, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v9
; GFX10-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v0
; GFX10-NEXT:    v_and_b32_e32 v3, 0xfe, v1
; GFX10-NEXT:    v_and_b32_e32 v1, 0xfffffe00, v1
; GFX10-NEXT:    v_and_b32_e32 v0, 0xfc, v0
; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x5000104
; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    global_store_dword v[7:8], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shl_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x5000104
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 2, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 1, v9
; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
; GFX9-NEXT:    v_and_b32_e32 v3, 0xfffffc00, v1
; GFX9-NEXT:    v_and_b32_e32 v4, 0xfe, v2
; GFX9-NEXT:    v_and_b32_e32 v2, 0xfffffe00, v2
; GFX9-NEXT:    v_and_b32_e32 v1, 0xfc, v1
; GFX9-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v1, off
; GFX9-NEXT:    global_store_dword v[7:8], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1:
  ; result = { vec1[0], vec0[1], vec0[0], vec1[1] }.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 0, i32 5>
  ; Per-lane shift left by the constant amounts 1, 2, 2, 1.
  %vecins = shl <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}


; sitofp_store_div: two-source <4 x i8> shuffle whose result is both
; converted to <4 x float> (signed) and stored unmodified.
;   %in0, %in1 - base pointers; each is indexed by the workitem id x.
;   %elt       - not referenced anywhere in the body.
;   %out1      - receives the <4 x float> value (a 128-bit store).
;   %out0      - receives the raw <4 x i8> shuffle result.
; Note that the %out0/%out1 roles are the reverse of the same-width tests in
; this group (lshr/mul/or/sdiv/shl), where %out0 takes the operated value.
; The checks for both targets expect the raw shuffle to appear as one
; v_perm_b32 with selector 0x6010205, and each lane's conversion as a
; sign-extending v_cvt_f32_i32_sdwa.
define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: sitofp_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v9, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
; GFX10-NEXT:    v_ashrrev_i16 v2, 8, v9
; GFX10-NEXT:    v_ashrrev_i16 v3, 8, v4
; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x6010205
; GFX10-NEXT:    v_bfe_i32 v10, v0, 0, 8
; GFX10-NEXT:    v_bfe_i32 v1, v1, 0, 8
; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
; GFX10-NEXT:    global_store_dword v[5:6], v4, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: sitofp_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v9, v[0:1], off
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x6010205
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 8, v9
; GFX9-NEXT:    v_bfe_i32 v10, v0, 0, 8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
; GFX9-NEXT:    v_ashrrev_i16_e32 v3, 8, v4
; GFX9-NEXT:    v_bfe_i32 v11, v2, 0, 8
; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT:    v_perm_b32 v4, v4, v9, s4
; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
; GFX9-NEXT:    global_store_dword v[5:6], v4, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick from %vec0 and 4-7 from %vec1:
  ; result = { vec1[1], vec0[2], vec0[1], vec1[2] }.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 1, i32 6>
  ; Convert every lane from signed i8 to float.
  %insvec = sitofp <4 x i8> %shuffle0_0 to <4 x float>
  store <4 x float> %insvec, ptr addrspace(1) %out1
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
  ret void
}


define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: srem_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v9, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_bfe_i32 v1, v4, 0, 8
; GFX10-NEXT:    v_bfe_i32 v2, v4, 16, 8
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ashrrev_i32_e32 v10, 24, v9
; GFX10-NEXT:    v_bfe_i32 v11, v4, 8, 8
; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 24, v4
; GFX10-NEXT:    v_bfe_i32 v13, v9, 16, 8
; GFX10-NEXT:    v_xor_b32_e32 v14, v2, v1
; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
; GFX10-NEXT:    v_xor_b32_e32 v16, v10, v11
; GFX10-NEXT:    v_cvt_f32_i32_e32 v11, v11
; GFX10-NEXT:    v_cvt_f32_i32_e32 v15, v2
; GFX10-NEXT:    v_cvt_f32_i32_e32 v10, v10
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v1
; GFX10-NEXT:    v_cvt_f32_i32_e32 v17, v12
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v19, v11
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v20, v15
; GFX10-NEXT:    v_xor_b32_e32 v2, v12, v2
; GFX10-NEXT:    v_xor_b32_e32 v12, v13, v12
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v21, v17
; GFX10-NEXT:    v_ashrrev_i32_e32 v14, 30, v14
; GFX10-NEXT:    v_cvt_f32_i32_e32 v13, v13
; GFX10-NEXT:    v_ashrrev_i32_e32 v16, 30, v16
; GFX10-NEXT:    v_mul_f32_e32 v18, v15, v18
; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
; GFX10-NEXT:    v_mul_f32_e32 v19, v10, v19
; GFX10-NEXT:    v_mul_f32_e32 v20, v17, v20
; GFX10-NEXT:    v_or_b32_e32 v14, 1, v14
; GFX10-NEXT:    v_trunc_f32_e32 v18, v18
; GFX10-NEXT:    v_mul_f32_e32 v21, v13, v21
; GFX10-NEXT:    v_trunc_f32_e32 v19, v19
; GFX10-NEXT:    v_trunc_f32_e32 v20, v20
; GFX10-NEXT:    v_or_b32_e32 v16, 1, v16
; GFX10-NEXT:    v_mad_f32 v22, -v18, v1, v15
; GFX10-NEXT:    v_trunc_f32_e32 v21, v21
; GFX10-NEXT:    v_mad_f32 v10, -v19, v11, v10
; GFX10-NEXT:    v_mad_f32 v23, -v20, v15, v17
; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 30, v12
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v22|, |v1|
; GFX10-NEXT:    v_or_b32_e32 v2, 1, v2
; GFX10-NEXT:    v_mad_f32 v13, -v21, v17, v13
; GFX10-NEXT:    v_cvt_i32_f32_e32 v18, v18
; GFX10-NEXT:    v_or_b32_e32 v12, 1, v12
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v14, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v10|, |v11|
; GFX10-NEXT:    v_cvt_i32_f32_e32 v19, v19
; GFX10-NEXT:    v_cvt_i32_f32_e32 v20, v20
; GFX10-NEXT:    v_cvt_i32_f32_e32 v21, v21
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0, v16, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v23|, |v15|
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v4
; GFX10-NEXT:    v_add_nc_u32_e32 v1, v18, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v10, v19, v10
; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v13|, |v17|
; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v4
; GFX10-NEXT:    v_mul_lo_u32 v3, v10, v3
; GFX10-NEXT:    v_add_nc_u32_e32 v2, v20, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc_lo
; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v11, v21, v11
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX10-NEXT:    v_mul_lo_u32 v10, v11, v12
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v12, v2
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x2070306
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    global_store_dword v[7:8], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: srem_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v9, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x2070306
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_i32 v2, v4, 0, 8
; GFX9-NEXT:    v_bfe_i32 v3, v4, 16, 8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_ashrrev_i32_e32 v11, 24, v9
; GFX9-NEXT:    v_bfe_i32 v12, v4, 8, 8
; GFX9-NEXT:    v_xor_b32_e32 v16, v3, v2
; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 24, v4
; GFX9-NEXT:    v_xor_b32_e32 v18, v11, v12
; GFX9-NEXT:    v_cvt_f32_i32_e32 v12, v12
; GFX9-NEXT:    v_cvt_f32_i32_e32 v17, v3
; GFX9-NEXT:    v_cvt_f32_i32_e32 v19, v13
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v20, v2
; GFX9-NEXT:    v_bfe_i32 v15, v9, 16, 8
; GFX9-NEXT:    v_cvt_f32_i32_e32 v11, v11
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v21, v12
; GFX9-NEXT:    v_xor_b32_e32 v3, v13, v3
; GFX9-NEXT:    v_xor_b32_e32 v13, v15, v13
; GFX9-NEXT:    v_cvt_f32_i32_e32 v15, v15
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v22, v17
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v23, v19
; GFX9-NEXT:    v_mul_f32_e32 v20, v17, v20
; GFX9-NEXT:    v_mul_f32_e32 v21, v11, v21
; GFX9-NEXT:    v_trunc_f32_e32 v20, v20
; GFX9-NEXT:    v_ashrrev_i32_e32 v16, 30, v16
; GFX9-NEXT:    v_mul_f32_e32 v22, v19, v22
; GFX9-NEXT:    v_mul_f32_e32 v23, v15, v23
; GFX9-NEXT:    v_trunc_f32_e32 v21, v21
; GFX9-NEXT:    v_mad_f32 v24, -v20, v2, v17
; GFX9-NEXT:    v_ashrrev_i32_e32 v18, 30, v18
; GFX9-NEXT:    v_or_b32_e32 v16, 1, v16
; GFX9-NEXT:    v_trunc_f32_e32 v22, v22
; GFX9-NEXT:    v_trunc_f32_e32 v23, v23
; GFX9-NEXT:    v_mad_f32 v11, -v21, v12, v11
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v24|, |v2|
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
; GFX9-NEXT:    v_or_b32_e32 v18, 1, v18
; GFX9-NEXT:    v_cvt_i32_f32_e32 v20, v20
; GFX9-NEXT:    v_cvt_i32_f32_e32 v21, v21
; GFX9-NEXT:    v_mad_f32 v25, -v22, v17, v19
; GFX9-NEXT:    v_cvt_i32_f32_e32 v22, v22
; GFX9-NEXT:    v_mad_f32 v15, -v23, v19, v15
; GFX9-NEXT:    v_cvt_i32_f32_e32 v23, v23
; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v11|, |v12|
; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 30, v13
; GFX9-NEXT:    v_or_b32_e32 v3, 1, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v18, vcc
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v25|, |v17|
; GFX9-NEXT:    v_or_b32_e32 v13, 1, v13
; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v15|, |v19|
; GFX9-NEXT:    v_cndmask_b32_e32 v12, 0, v13, vcc
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 24, v4
; GFX9-NEXT:    v_add_u32_e32 v2, v20, v2
; GFX9-NEXT:    v_add_u32_e32 v11, v21, v11
; GFX9-NEXT:    v_add_u32_e32 v3, v22, v3
; GFX9-NEXT:    v_add_u32_e32 v12, v23, v12
; GFX9-NEXT:    v_perm_b32 v1, v4, v9, s4
; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
; GFX9-NEXT:    v_mul_lo_u32 v4, v11, v10
; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v0
; GFX9-NEXT:    v_mul_lo_u32 v10, v12, v14
; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
; GFX9-NEXT:    v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX9-NEXT:    v_sub_u32_e32 v3, v14, v3
; GFX9-NEXT:    v_sub_u32_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 2>
  %vecins = srem <4 x i8> %shuffle0_0, %vec1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}


; sub_store_div: a byte shuffle that feeds a vector sub and is also stored.
; Loads one <4 x i8> from %in0 and one from %in1 (both indexed by
; workitem.id.x), builds %shuffle0_0 = <vec1[3], vec0[0], vec1[3], vec1[2]>,
; stores (%shuffle0_0 - %vec1) to %out0 and %shuffle0_0 itself to %out1.
; The autogenerated checks expect the stored shuffle to be produced by a
; single v_perm_b32 with selector 0x6070007 on both targets.
; %elt is not referenced by the body.
define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: sub_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v2, v[2:3], off
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_sub_nc_u16 v3, v0, v3
; GFX10-NEXT:    v_sub_nc_u16 v9, v1, v4
; GFX10-NEXT:    v_sub_nc_u16 v10, v4, v2
; GFX10-NEXT:    v_sub_nc_u16 v1, v4, v1
; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x6070007
; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX10-NEXT:    v_lshlrev_b16 v4, 8, v9
; GFX10-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[5:6], v1, off
; GFX10-NEXT:    global_store_dword v[7:8], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: sub_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    global_load_dword v2, v[2:3], off
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x6070007
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
; GFX9-NEXT:    v_sub_u16_sdwa v9, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v4, v2, v0, s4
; GFX9-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT:    v_sub_u16_e32 v2, v3, v2
; GFX9-NEXT:    v_sub_u16_e32 v1, v3, v1
; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v4, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element addresses into both input buffers.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick bytes of %vec0, 4-7 pick bytes of %vec1.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 0, i32 7, i32 6>
  %vecins = sub <4 x i8> %shuffle0_0, %vec1
  ; The shuffle has two users: the sub above and the direct store below.
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}


; sv_store_div: a byte shuffle whose result feeds a second shufflevector.
; Loads one <4 x i8> from %in0 and one from %in1 (both indexed by
; workitem.id.x), builds %shuffle0_0 = <vec0[1], vec0[3], vec0[1], vec1[0]>,
; then %insvec = <vec1[2], shuffle0_0[3], vec1[3], shuffle0_0[0]>.
; %elt, %out0 and %out2 are not referenced by the body.
;
; NOTE(review): both stores below write to %out1, so the %insvec store is
; overwritten by the %shuffle0_0 store. The autogenerated checks agree: they
; contain only one v_perm_b32 (selector 0x50705, i.e. %shuffle0_0) and one
; global_store_dword to v[7:8]. As written, the second shufflevector is not
; exercised by this test. One of the stores was presumably meant to target
; %out0 or %out2 - confirm the intent, then fix the IR and regenerate the
; checks with utils/update_llc_test_checks.py.
define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
; GFX10-LABEL: sv_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v5, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x50705
; GFX10-NEXT:    global_store_dword v[7:8], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: sv_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v5, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x50705
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
; GFX9-NEXT:    global_store_dword v[7:8], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element addresses into both input buffers.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick bytes of the first operand, 4-7 of the second.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 1, i32 4>
  %insvec = shufflevector <4 x i8> %shuffle0_0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 0>
  ; NOTE(review): same destination twice; the first store is dead (see header).
  store <4 x i8> %insvec, ptr addrspace(1) %out1
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}


; trunc_store_div: a byte shuffle that feeds a trunc to <4 x i1> and is also
; stored. Loads one <4 x i8> from %in0 and one from %in1 (both indexed by
; workitem.id.x), builds %shuffle0_0 = <vec1[1], vec0[2], vec1[1], vec0[0]>,
; stores the <4 x i1> truncation to %out1 and %shuffle0_0 itself to %out0.
; In the autogenerated checks the i1 vector is written with a single
; global_store_byte after masking with 15, and the stored shuffle is a single
; v_perm_b32 with selector 0x50205. %elt is not referenced by the body.
define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: trunc_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v9, v[2:3], off
; GFX10-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_and_b32_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
; GFX10-NEXT:    v_lshlrev_b16 v2, 2, v0
; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX10-NEXT:    v_lshlrev_b16 v1, 3, v4
; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x50205
; GFX10-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX10-NEXT:    global_store_byte v[7:8], v0, off
; GFX10-NEXT:    global_store_dword v[5:6], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: trunc_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    s_mov_b32 s4, 0x50205
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 3, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_sdwa v2, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX9-NEXT:    v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
; GFX9-NEXT:    v_perm_b32 v1, v9, v4, s4
; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 2, v2
; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v3
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    global_store_byte v[7:8], v0, off
; GFX9-NEXT:    global_store_dword v[5:6], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element addresses into both input buffers.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick bytes of %vec0, 4-7 pick bytes of %vec1.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 5, i32 0>
  %insvec = trunc <4 x i8> %shuffle0_0 to <4 x i1>
  ; Unlike the arithmetic tests in this file, the derived value goes to %out1
  ; and the raw shuffle goes to %out0.
  store <4 x i1> %insvec, ptr addrspace(1) %out1
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
  ret void
}

; udiv: a byte shuffle that feeds an unsigned vector divide and is also stored.
; Loads one <4 x i8> from %in0 and one from %in1 (both indexed by
; workitem.id.x), builds %shuffle0_0 = <vec0[3], vec1[2], vec0[0], vec1[0]>,
; stores (%shuffle0_0 udiv %vec1) to %out0 and %shuffle0_0 itself to %out1.
; In the autogenerated checks the divide is expanded inline per byte
; (v_cvt_f32_ubyteN, v_rcp_iflag_f32, v_mul/v_trunc/v_mad, then a compare and
; carry-add correction), and the stored shuffle is a single v_perm_b32 with
; selector 0x40207. %elt is not referenced by the body.
define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: udiv:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v2, v[2:3], off
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v9, v2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v14, v0
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v4, v2
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v10, v1
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v11, v3
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v13, v9
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v15, v0
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v12, v4
; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x40207
; GFX10-NEXT:    v_mul_f32_e32 v10, v14, v10
; GFX10-NEXT:    v_mul_f32_e32 v11, v4, v11
; GFX10-NEXT:    v_mul_f32_e32 v13, v1, v13
; GFX10-NEXT:    v_mul_f32_e32 v12, v15, v12
; GFX10-NEXT:    v_trunc_f32_e32 v10, v10
; GFX10-NEXT:    v_trunc_f32_e32 v11, v11
; GFX10-NEXT:    v_trunc_f32_e32 v13, v13
; GFX10-NEXT:    v_trunc_f32_e32 v12, v12
; GFX10-NEXT:    v_mad_f32 v14, -v10, v1, v14
; GFX10-NEXT:    v_cvt_u32_f32_e32 v10, v10
; GFX10-NEXT:    v_mad_f32 v16, -v11, v3, v4
; GFX10-NEXT:    v_mad_f32 v17, -v13, v9, v1
; GFX10-NEXT:    v_cvt_u32_f32_e32 v11, v11
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v14|, v1
; GFX10-NEXT:    v_cvt_u32_f32_e32 v13, v13
; GFX10-NEXT:    v_mad_f32 v15, -v12, v4, v15
; GFX10-NEXT:    v_cvt_u32_f32_e32 v12, v12
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v16|, v3
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v17|, v9
; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v15|, v4
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_lshlrev_b16 v9, 8, v9
; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
; GFX10-NEXT:    v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[5:6], v1, off
; GFX10-NEXT:    global_store_dword v[7:8], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: udiv:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v9, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x40207
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v11, v2
; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v4
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v12, v3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v1, v9
; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v10, v4
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v13, v10
; GFX9-NEXT:    v_mul_f32_e32 v11, v1, v11
; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v4, v4
; GFX9-NEXT:    v_trunc_f32_e32 v11, v11
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v14, v4
; GFX9-NEXT:    v_mul_f32_e32 v12, v10, v12
; GFX9-NEXT:    v_mad_f32 v1, -v11, v2, v1
; GFX9-NEXT:    v_cvt_u32_f32_e32 v11, v11
; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v9, v9
; GFX9-NEXT:    v_trunc_f32_e32 v12, v12
; GFX9-NEXT:    v_mul_f32_e32 v13, v9, v13
; GFX9-NEXT:    v_mad_f32 v15, -v12, v3, v10
; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v12
; GFX9-NEXT:    v_trunc_f32_e32 v13, v13
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v2
; GFX9-NEXT:    v_mul_f32_e32 v14, v2, v14
; GFX9-NEXT:    v_mad_f32 v9, -v13, v10, v9
; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v13
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v11, vcc
; GFX9-NEXT:    v_trunc_f32_e32 v14, v14
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v15|, v3
; GFX9-NEXT:    v_mad_f32 v16, -v14, v4, v2
; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v14
; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v12, vcc
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v9|, v10
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v13, vcc
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v16|, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v14, vcc
; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v1, off
; GFX9-NEXT:    global_store_dword v[7:8], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element addresses into both input buffers.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick bytes of %vec0, 4-7 pick bytes of %vec1.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 6, i32 0, i32 4>
  %vecins = udiv <4 x i8> %shuffle0_0, %vec1
  ; The shuffle has two users: the udiv above and the direct store below.
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}


; uitofp_store_div: a byte shuffle that feeds a uitofp and is also stored.
; Loads one <4 x i8> from %in0 and one from %in1 (both indexed by
; workitem.id.x), builds %shuffle0_0 = <vec1[0], vec0[1], vec0[2], vec1[1]>,
; stores the <4 x float> conversion to %out1 and %shuffle0_0 itself to %out0.
; In the autogenerated checks each float lane is a v_cvt_f32_ubyteN applied
; directly to one of the loaded dwords, and the stored shuffle is a single
; v_perm_b32 with selector 0x5020104. %elt is not referenced by the body.
define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: uitofp_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v9, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v9
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v9
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x5020104
; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
; GFX10-NEXT:    global_store_dword v[5:6], v4, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: uitofp_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x5020104
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v9
; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v9
; GFX9-NEXT:    v_perm_b32 v10, v9, v4, s4
; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
; GFX9-NEXT:    global_store_dword v[5:6], v10, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element addresses into both input buffers.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick bytes of %vec0, 4-7 pick bytes of %vec1.
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 2, i32 5>
  %insvec = uitofp <4 x i8> %shuffle0_0 to <4 x float>
  ; Unlike the arithmetic tests in this file, the derived value goes to %out1
  ; and the raw shuffle goes to %out0.
  store <4 x float> %insvec, ptr addrspace(1) %out1
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
  ret void
}


; urem_store_div: a byte shuffle that feeds an unsigned vector remainder and
; is also stored. Loads one <4 x i8> from %in0 and one from %in1 (both indexed
; by workitem.id.x), builds %shuffle0_0 = <vec1[1], vec1[1], vec1[1], vec0[2]>,
; stores (%shuffle0_0 urem %vec1) to %out0 and %shuffle0_0 itself to %out1.
; In the autogenerated checks the remainder is expanded inline per byte
; (float-reciprocal quotient estimate, compare and carry-add correction, then
; v_mul_lo_u32 and a subtract), and the stored shuffle is a single v_perm_b32
; with selector 0x2050505. %elt is not referenced by the body.
define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: urem_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_dword v2, v[2:3], off
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v4, v2
; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v9, v2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v15, v0
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v10, v1
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v11, v3
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v12, v4
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v13, v9
; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 24, v2
; GFX10-NEXT:    v_mul_f32_e32 v10, v3, v10
; GFX10-NEXT:    v_mul_f32_e32 v11, v3, v11
; GFX10-NEXT:    v_mul_f32_e32 v12, v3, v12
; GFX10-NEXT:    v_mul_f32_e32 v13, v15, v13
; GFX10-NEXT:    v_trunc_f32_e32 v10, v10
; GFX10-NEXT:    v_trunc_f32_e32 v11, v11
; GFX10-NEXT:    v_trunc_f32_e32 v12, v12
; GFX10-NEXT:    v_trunc_f32_e32 v13, v13
; GFX10-NEXT:    v_mad_f32 v18, -v10, v1, v3
; GFX10-NEXT:    v_cvt_u32_f32_e32 v10, v10
; GFX10-NEXT:    v_mad_f32 v19, -v11, v3, v3
; GFX10-NEXT:    v_cvt_u32_f32_e32 v11, v11
; GFX10-NEXT:    v_mad_f32 v20, -v12, v4, v3
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v18|, v1
; GFX10-NEXT:    v_cvt_u32_f32_e32 v12, v12
; GFX10-NEXT:    v_mad_f32 v15, -v13, v9, v15
; GFX10-NEXT:    v_cvt_u32_f32_e32 v13, v13
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v19|, v3
; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v20|, v4
; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v16
; GFX10-NEXT:    v_sub_nc_u32_e32 v1, v16, v1
; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v15|, v9
; GFX10-NEXT:    v_mul_lo_u32 v4, v4, v14
; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_mul_lo_u32 v9, v9, v17
; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v16, v4
; GFX10-NEXT:    v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x2050505
; GFX10-NEXT:    v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[5:6], v1, off
; GFX10-NEXT:    global_store_dword v[7:8], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: urem_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    global_load_dword v9, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x2050505
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v15, v2
; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v4
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v16, v3
; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v11, v4
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v17, v11
; GFX9-NEXT:    v_mul_f32_e32 v15, v3, v15
; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v14, v4
; GFX9-NEXT:    v_trunc_f32_e32 v15, v15
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v18, v14
; GFX9-NEXT:    v_mul_f32_e32 v16, v3, v16
; GFX9-NEXT:    v_mad_f32 v19, -v15, v2, v3
; GFX9-NEXT:    v_cvt_u32_f32_e32 v15, v15
; GFX9-NEXT:    v_trunc_f32_e32 v16, v16
; GFX9-NEXT:    v_mul_f32_e32 v17, v3, v17
; GFX9-NEXT:    v_mad_f32 v20, -v16, v3, v3
; GFX9-NEXT:    v_cvt_u32_f32_e32 v16, v16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v13, v9
; GFX9-NEXT:    v_trunc_f32_e32 v17, v17
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, v2
; GFX9-NEXT:    v_mul_f32_e32 v18, v13, v18
; GFX9-NEXT:    v_mad_f32 v21, -v17, v11, v3
; GFX9-NEXT:    v_cvt_u32_f32_e32 v17, v17
; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v15, vcc
; GFX9-NEXT:    v_trunc_f32_e32 v18, v18
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v20|, v3
; GFX9-NEXT:    v_mad_f32 v13, -v18, v14, v13
; GFX9-NEXT:    v_cvt_u32_f32_e32 v18, v18
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v16, vcc
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v21|, v11
; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v17, vcc
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v13|, v14
; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v18, vcc
; GFX9-NEXT:    v_perm_b32 v1, v4, v9, s4
; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v10
; GFX9-NEXT:    v_mul_lo_u32 v0, v11, v0
; GFX9-NEXT:    v_mul_lo_u32 v4, v13, v12
; GFX9-NEXT:    v_sub_u32_e32 v2, v10, v2
; GFX9-NEXT:    v_sub_u32_sdwa v3, v10, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_sub_u32_e32 v0, v10, v0
; GFX9-NEXT:    v_sub_u32_sdwa v4, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element addresses into both input buffers.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 pick bytes of %vec0, 4-7 pick bytes of %vec1; three of the
  ; four lanes here reuse the same source byte (index 5).
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 2>
  %vecins = urem <4 x i8> %shuffle0_0, %vec1
  ; The shuffle has two users: the urem above and the direct store below.
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}


; xor_store_div: each work-item loads one <4 x i8> from %in0 and one from %in1
; (both indexed by llvm.amdgcn.workitem.id.x) and builds the two-source shuffle
;   <vec1[3], vec0[3], vec1[2], vec1[1]>        (mask <7, 3, 6, 5>)
; Two values are then stored:
;   %out0 <- shuffle xor <1, 2, 2, 1>
;   %out1 <- the unmodified shuffle
; In the expected output below, the plain shuffle is produced by a single
; v_perm_b32 (selector 0x5060307), while the xor result is assembled separately
; from v_and / v_xor / v_or instructions (several of them in SDWA form).
; %elt is not referenced by the body.
; NOTE(review): %elt presumably only exists to keep the argument layout in line
; with sibling tests - confirm before removing it, since the expected register
; numbers depend on the signature.
; NOTE(review): the "_div" suffix presumably stands for "divergent" (per-lane
; addresses derived from the work-item id) - confirm.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: xor_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v9, v[2:3], off
; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffffff00
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff00, v9
; GFX10-NEXT:    v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX10-NEXT:    v_xor_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    v_xor_b32_e32 v0, 0x200, v0
; GFX10-NEXT:    v_xor_b32_e32 v3, 0x100, v3
; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x5060307
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    global_store_dword v[7:8], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: xor_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    s_movk_i32 s4, 0xff00
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, 2
; GFX9-NEXT:    s_mov_b32 s5, 0x5060307
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_sdwa v2, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff00, v9
; GFX9-NEXT:    v_xor_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX9-NEXT:    v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_xor_b32_e32 v2, 0x200, v2
; GFX9-NEXT:    v_xor_b32_e32 v3, 0x100, v3
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_perm_b32 v4, v9, v4, s5
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    global_store_dword v[7:8], v4, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 3, i32 6, i32 5>
  %vecins = xor <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
  ret void
}


; zext_store_div: each work-item loads one <4 x i8> from %in0 and one from %in1
; (both indexed by llvm.amdgcn.workitem.id.x) and builds the two-source shuffle
;   <vec0[0], vec0[1], vec0[2], vec1[0]>        (mask <0, 1, 2, 4>)
; Two values are then stored, in this order:
;   %out1 <- the shuffle zero-extended to <4 x i16>
;   %out0 <- the unmodified <4 x i8> shuffle
; In the expected output below, the plain shuffle is produced by a single
; v_perm_b32 (selector 0x60504); the widened value is built from masked bytes
; that are packed pairwise with v_perm_b32 (selector 0x5040100) and written with
; one global_store_dwordx2.
; %elt is not referenced by the body.
; NOTE(review): %elt presumably only exists to keep the argument layout in line
; with sibling tests - confirm before removing it, since the expected register
; numbers depend on the signature.
; NOTE(review): the "_div" suffix presumably stands for "divergent" (per-lane
; addresses derived from the work-item id) - confirm.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: zext_store_div:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v9, v[2:3], off
; GFX10-NEXT:    v_mov_b32_e32 v0, 0xff
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v4
; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v9
; GFX10-NEXT:    v_and_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    v_perm_b32 v0, v1, v2, 0x5040100
; GFX10-NEXT:    v_perm_b32 v2, v4, v9, 0x60504
; GFX10-NEXT:    v_perm_b32 v1, v3, v10, 0x5040100
; GFX10-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
; GFX10-NEXT:    global_store_dword v[5:6], v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: zext_store_div:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v9, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x60504
; GFX9-NEXT:    s_movk_i32 s5, 0xff
; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4
; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v4
; GFX9-NEXT:    v_and_b32_e32 v3, 0xff, v9
; GFX9-NEXT:    v_and_b32_sdwa v4, v4, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s6
; GFX9-NEXT:    v_perm_b32 v1, v3, v4, s6
; GFX9-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
; GFX9-NEXT:    global_store_dword v[5:6], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %insvec = zext <4 x i8> %shuffle0_0 to <4 x i16>
  store <4 x i16> %insvec, ptr addrspace(1) %out1
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
  ret void
}
