blob: a21862c4a9395c5713e4cbabac8c2724800d0845 [file] [log] [blame]
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
// REQUIRES: amdgpu-registered-target
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef unsigned int uint;
typedef unsigned short int ushort;
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef half __attribute__((ext_vector_type(2))) half2;
// CHECK-LABEL: @test_setprio_inc_wg(
// CHECK-NEXT: entry:
// CHECK-NEXT: call void @llvm.amdgcn.s.setprio.inc.wg(i16 10)
// CHECK-NEXT: ret void
//
void test_setprio_inc_wg() {
__builtin_amdgcn_s_setprio_inc_wg(10);
}
// CHECK-LABEL: @test_s_monitor_sleep(
// CHECK-NEXT: entry:
// CHECK-NEXT: call void @llvm.amdgcn.s.monitor.sleep(i16 10)
// CHECK-NEXT: ret void
//
void test_s_monitor_sleep() {
__builtin_amdgcn_s_monitor_sleep(10);
}
// CHECK-LABEL: @test_s_wait_asynccnt(
// CHECK-NEXT: entry:
// CHECK-NEXT: call void @llvm.amdgcn.s.wait.asynccnt(i16 0)
// CHECK-NEXT: ret void
//
void test_s_wait_asynccnt() {
__builtin_amdgcn_s_wait_asynccnt(0);
}
// CHECK-LABEL: @test_s_wait_tensorcnt(
// CHECK-NEXT: entry:
// CHECK-NEXT: call void @llvm.amdgcn.s.wait.tensorcnt(i16 0)
// CHECK-NEXT: ret void
//
void test_s_wait_tensorcnt() {
__builtin_amdgcn_s_wait_tensorcnt(0);
}
// CHECK-LABEL: @test_prng_b32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
// CHECK-NEXT: ret void
//
void test_prng_b32(global uint* out, uint a) {
*out = __builtin_amdgcn_prng_b32(a);
}
// CHECK-LABEL: @test_tanh_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.tanh.f32(float [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
// CHECK-NEXT: ret void
//
void test_tanh_f32(global float* out, float a)
{
*out = __builtin_amdgcn_tanhf(a);
}
// CHECK-LABEL: @test_tanh_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(1) [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr addrspace(1) [[TMP0]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.amdgcn.tanh.f16(half [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store half [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
// CHECK-NEXT: ret void
//
void test_tanh_f16(global half* out, global half* a)
{
*out = __builtin_amdgcn_tanhh(*a);
}
// CHECK-LABEL: @test_tanh_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.tanh.bf16(bfloat [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
// CHECK-NEXT: ret void
//
void test_tanh_bf16(global __bf16* out, __bf16 a)
{
*out = __builtin_amdgcn_tanh_bf16(a);
}
// CHECK-LABEL: @test_rcp_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rcp.bf16(bfloat [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
// CHECK-NEXT: ret void
//
void test_rcp_bf16(global __bf16* out, __bf16 a)
{
*out = __builtin_amdgcn_rcp_bf16(a);
}
// CHECK-LABEL: @test_sqrt_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
// CHECK-NEXT: ret void
//
void test_sqrt_bf16(global __bf16* out, __bf16 a)
{
*out = __builtin_amdgcn_sqrt_bf16(a);
}
// CHECK-LABEL: @test_rsq_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
// CHECK-NEXT: ret void
//
void test_rsq_bf16(global __bf16* out, __bf16 a)
{
*out = __builtin_amdgcn_rsq_bf16(a);
}
// CHECK-LABEL: @test_log_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.log.bf16(bfloat [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
// CHECK-NEXT: ret void
//
void test_log_bf16(global __bf16* out, __bf16 a)
{
*out = __builtin_amdgcn_log_bf16(a);
}
// CHECK-LABEL: @test_exp2_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.exp2.bf16(bfloat [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
// CHECK-NEXT: ret void
//
void test_exp2_bf16(global __bf16* out, __bf16 a)
{
*out = __builtin_amdgcn_exp2_bf16(a);
}
// CHECK-LABEL: @test_sin_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.sin.bf16(bfloat [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
// CHECK-NEXT: ret void
//
void test_sin_bf16(global __bf16* out, __bf16 a)
{
*out = __builtin_amdgcn_sin_bf16(a);
}
// CHECK-LABEL: @test_cos_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.cos.bf16(bfloat [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
// CHECK-NEXT: ret void
//
void test_cos_bf16(global __bf16* out, __bf16 a)
{
*out = __builtin_amdgcn_cos_bf16(a);
}
// CHECK-LABEL: @test_cvt_f16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP0]], i32 0)
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP2]], i64 0
// CHECK-NEXT: store half [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP3]], i32 1)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP5]], i64 1
// CHECK-NEXT: store half [[TMP4]], ptr addrspace(1) [[ARRAYIDX1]], align 2
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP6]], i32 2)
// CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP8]], i64 2
// CHECK-NEXT: store half [[TMP7]], ptr addrspace(1) [[ARRAYIDX2]], align 2
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP9]], i32 3)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP11]], i64 3
// CHECK-NEXT: store half [[TMP10]], ptr addrspace(1) [[ARRAYIDX3]], align 2
// CHECK-NEXT: ret void
//
void test_cvt_f16_fp8(global half* out, int a)
{
out[0] = __builtin_amdgcn_cvt_f16_fp8(a, 0);
out[1] = __builtin_amdgcn_cvt_f16_fp8(a, 1);
out[2] = __builtin_amdgcn_cvt_f16_fp8(a, 2);
out[3] = __builtin_amdgcn_cvt_f16_fp8(a, 3);
}
// CHECK-LABEL: @test_cvt_f16_bf8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP0]], i32 0)
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP2]], i64 0
// CHECK-NEXT: store half [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP3]], i32 1)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP5]], i64 1
// CHECK-NEXT: store half [[TMP4]], ptr addrspace(1) [[ARRAYIDX1]], align 2
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP6]], i32 2)
// CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP8]], i64 2
// CHECK-NEXT: store half [[TMP7]], ptr addrspace(1) [[ARRAYIDX2]], align 2
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP9]], i32 3)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP11]], i64 3
// CHECK-NEXT: store half [[TMP10]], ptr addrspace(1) [[ARRAYIDX3]], align 2
// CHECK-NEXT: ret void
//
void test_cvt_f16_bf8(global half* out, int a)
{
out[0] = __builtin_amdgcn_cvt_f16_bf8(a, 0);
out[1] = __builtin_amdgcn_cvt_f16_bf8(a, 1);
out[2] = __builtin_amdgcn_cvt_f16_bf8(a, 2);
out[3] = __builtin_amdgcn_cvt_f16_bf8(a, 3);
}
// CHECK-LABEL: @test_cvt_pk_f16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[TMP2]], i64 0
// CHECK-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_pk_f16_fp8(global half2* out, short a)
{
out[0] = __builtin_amdgcn_cvt_pk_f16_fp8(a);
}
// CHECK-LABEL: @test_cvt_pk_f16_bf8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[TMP2]], i64 0
// CHECK-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_pk_f16_bf8(global half2* out, short a)
{
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
}
// CHECK-LABEL: @test_sat_pk4_i4_i8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP2]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 [[TMP3]])
// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i16 [[TMP4]], ptr [[TMP5]], align 2
// CHECK-NEXT: ret void
//
void test_sat_pk4_i4_i8(ushort *out, uint src)
{
*out = __builtin_amdgcn_sat_pk4_i4_i8(src);
*out = __builtin_amdgcn_sat_pk4_u4_u8(src);
}
// CHECK-LABEL: @test_permlane16_swap(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
// CHECK-NEXT: ret void
//
void test_permlane16_swap(global uint2* out, uint old, uint src) {
*out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
*out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
}
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP0]], i32 0)
// CHECK-NEXT: [[CONV:%.*]] = fptosi float [[TMP1]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP2]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP3]], i32 1)
// CHECK-NEXT: [[CONV1:%.*]] = fptosi float [[TMP4]] to i32
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[CONV1]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP6]], i32 2)
// CHECK-NEXT: [[CONV2:%.*]] = fptosi float [[TMP7]] to i32
// CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[CONV2]], ptr addrspace(1) [[TMP8]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP9]], i32 3)
// CHECK-NEXT: [[CONV3:%.*]] = fptosi float [[TMP10]] to i32
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[CONV3]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_f32_fp8_e5m3(global int* out, int a)
{
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 0);
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 1);
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 2);
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 3);
}