; Source blob: d3851b1a084d68ffafe8beac768a895d9e6a7678
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s
; Intrinsics under test. Every variant shares one operand list:
;   (i32 %old, <source type> %src, i32 %seed, float %scale, i32 %dst_sel) -> i32
; Declarations are grouped by source type: bfloat, half, float.
declare i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 %old, bfloat %src, i32 %seed, float %scale, i32 %dst_sel)
declare i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 %old, bfloat %src, i32 %seed, float %scale, i32 %dst_sel)
declare i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 %old, half %src, i32 %seed, float %scale, i32 %dst_sel)
declare i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 %old, half %src, i32 %seed, float %scale, i32 %dst_sel)
declare i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 %old, float %src, i32 %seed, float %scale, i32 %dst_sel)
declare i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 %old, float %src, i32 %seed, float %scale, i32 %dst_sel)
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.bf16 with dst_sel = 0.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_bf16 with no op_sel modifier.
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_0(ptr addrspace(1) %out, bfloat %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_bf16_dst_sel_0:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 %prev, bfloat %src, i32 %seed, float %scale, i32 0)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.bf16 with dst_sel = 1.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_bf16 carrying op_sel:[0,0,1,0].
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_1(ptr addrspace(1) %out, bfloat %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_bf16_dst_sel_1:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 %prev, bfloat %src, i32 %seed, float %scale, i32 1)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.bf16 with dst_sel = 2.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_bf16 carrying op_sel:[0,0,0,1].
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_2(ptr addrspace(1) %out, bfloat %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_bf16_dst_sel_2:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 %prev, bfloat %src, i32 %seed, float %scale, i32 2)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.bf16 with dst_sel = 3.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_bf16 carrying op_sel:[0,0,1,1].
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_3(ptr addrspace(1) %out, bfloat %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_bf16_dst_sel_3:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 %prev, bfloat %src, i32 %seed, float %scale, i32 3)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.f16 with dst_sel = 0.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_f16 with no op_sel modifier.
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_0(ptr addrspace(1) %out, half %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_f16_dst_sel_0:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 %prev, half %src, i32 %seed, float %scale, i32 0)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.f16 with dst_sel = 1.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_f16 carrying op_sel:[0,0,1,0].
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_1(ptr addrspace(1) %out, half %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_f16_dst_sel_1:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 %prev, half %src, i32 %seed, float %scale, i32 1)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.f16 with dst_sel = 2.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_f16 carrying op_sel:[0,0,0,1].
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_2(ptr addrspace(1) %out, half %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_f16_dst_sel_2:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 %prev, half %src, i32 %seed, float %scale, i32 2)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.f16 with dst_sel = 3.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_f16 carrying op_sel:[0,0,1,1].
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_3(ptr addrspace(1) %out, half %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_f16_dst_sel_3:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,1,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 %prev, half %src, i32 %seed, float %scale, i32 3)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.f32 with dst_sel = 0.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_f32 with no op_sel modifier.
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_0(ptr addrspace(1) %out, float %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_f32_dst_sel_0:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 %prev, float %src, i32 %seed, float %scale, i32 0)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.f32 with dst_sel = 1.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_f32 carrying op_sel:[0,0,1,0].
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_1(ptr addrspace(1) %out, float %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_f32_dst_sel_1:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 %prev, float %src, i32 %seed, float %scale, i32 1)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.f32 with dst_sel = 2.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_f32 carrying op_sel:[0,0,0,1].
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_2(ptr addrspace(1) %out, float %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_f32_dst_sel_2:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 %prev, float %src, i32 %seed, float %scale, i32 2)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.bf8.f32 with dst_sel = 3.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_bf8_f32 carrying op_sel:[0,0,1,1].
define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_3(ptr addrspace(1) %out, float %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_bf8_f32_dst_sel_3:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,1,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 %prev, float %src, i32 %seed, float %scale, i32 3)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.bf16 with dst_sel = 0.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_bf16 with no op_sel modifier.
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_0(ptr addrspace(1) %out, bfloat %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_bf16_dst_sel_0:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 %prev, bfloat %src, i32 %seed, float %scale, i32 0)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.bf16 with dst_sel = 1.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_bf16 carrying op_sel:[0,0,1,0].
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_1(ptr addrspace(1) %out, bfloat %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_bf16_dst_sel_1:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 %prev, bfloat %src, i32 %seed, float %scale, i32 1)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.bf16 with dst_sel = 2.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_bf16 carrying op_sel:[0,0,0,1].
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_2(ptr addrspace(1) %out, bfloat %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_bf16_dst_sel_2:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 %prev, bfloat %src, i32 %seed, float %scale, i32 2)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.bf16 with dst_sel = 3.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_bf16 carrying op_sel:[0,0,1,1].
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_3(ptr addrspace(1) %out, bfloat %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_bf16_dst_sel_3:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 %prev, bfloat %src, i32 %seed, float %scale, i32 3)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.f16 with dst_sel = 0.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_f16 with no op_sel modifier.
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_0(ptr addrspace(1) %out, half %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_f16_dst_sel_0:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 %prev, half %src, i32 %seed, float %scale, i32 0)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.f16 with dst_sel = 1.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_f16 carrying op_sel:[0,0,1,0].
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_1(ptr addrspace(1) %out, half %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_f16_dst_sel_1:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 %prev, half %src, i32 %seed, float %scale, i32 1)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.f16 with dst_sel = 2.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_f16 carrying op_sel:[0,0,0,1].
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_2(ptr addrspace(1) %out, half %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_f16_dst_sel_2:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 %prev, half %src, i32 %seed, float %scale, i32 2)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.f16 with dst_sel = 3.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_f16 carrying op_sel:[0,0,1,1].
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_3(ptr addrspace(1) %out, half %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_f16_dst_sel_3:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,1,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 %prev, half %src, i32 %seed, float %scale, i32 3)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.f32 with dst_sel = 0.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_f32 with no op_sel modifier.
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_0(ptr addrspace(1) %out, float %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_f32_dst_sel_0:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 %prev, float %src, i32 %seed, float %scale, i32 0)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.f32 with dst_sel = 1.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_f32 carrying op_sel:[0,0,1,0].
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_1(ptr addrspace(1) %out, float %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_f32_dst_sel_1:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 %prev, float %src, i32 %seed, float %scale, i32 1)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.f32 with dst_sel = 2.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_f32 carrying op_sel:[0,0,0,1].
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_2(ptr addrspace(1) %out, float %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_f32_dst_sel_2:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 %prev, float %src, i32 %seed, float %scale, i32 2)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}
; Exercises llvm.amdgcn.cvt.scalef32.sr.fp8.f32 with dst_sel = 3.
; The dword currently stored at %out is passed as the %old operand and the
; result is written back to %out. Expected selection: one
; v_cvt_scalef32_sr_fp8_f32 carrying op_sel:[0,0,1,1].
define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_3(ptr addrspace(1) %out, float %src, i32 %seed, float %scale) {
; GFX950-LABEL: test_cvt_scalef32_sr_fp8_f32_dst_sel_3:
; GFX950: ; %bb.0:
; GFX950-NEXT: global_load_dword v5, v[0:1], off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,1,1]
; GFX950-NEXT: global_store_dword v[0:1], v5, off
; GFX950-NEXT: s_endpgm
  %prev = load i32, ptr addrspace(1) %out, align 4
  %result = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 %prev, float %src, i32 %seed, float %scale, i32 3)
  store i32 %result, ptr addrspace(1) %out, align 4
  ret void
}