| // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py |
| // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s |
| // REQUIRES: amdgpu-registered-target |
| |
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable |
| |
| typedef unsigned int uint; |
| typedef unsigned short int ushort; |
| typedef unsigned int __attribute__((ext_vector_type(2))) uint2; |
| typedef half __attribute__((ext_vector_type(2))) half2; |
| |
| // CHECK-LABEL: @test_setprio_inc_wg( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: call void @llvm.amdgcn.s.setprio.inc.wg(i16 10) |
| // CHECK-NEXT: ret void |
| // |
| void test_setprio_inc_wg() { |
| __builtin_amdgcn_s_setprio_inc_wg(10); |
| } |
| |
| // CHECK-LABEL: @test_s_monitor_sleep( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: call void @llvm.amdgcn.s.monitor.sleep(i16 10) |
| // CHECK-NEXT: ret void |
| // |
| void test_s_monitor_sleep() { |
| __builtin_amdgcn_s_monitor_sleep(10); |
| } |
| |
| // CHECK-LABEL: @test_s_wait_asynccnt( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: call void @llvm.amdgcn.s.wait.asynccnt(i16 0) |
| // CHECK-NEXT: ret void |
| // |
| void test_s_wait_asynccnt() { |
| __builtin_amdgcn_s_wait_asynccnt(0); |
| } |
| |
| // CHECK-LABEL: @test_s_wait_tensorcnt( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: call void @llvm.amdgcn.s.wait.tensorcnt(i16 0) |
| // CHECK-NEXT: ret void |
| // |
| void test_s_wait_tensorcnt() { |
| __builtin_amdgcn_s_wait_tensorcnt(0); |
| } |
| |
| // CHECK-LABEL: @test_prng_b32( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_prng_b32(global uint* out, uint a) { |
| *out = __builtin_amdgcn_prng_b32(a); |
| } |
| |
| // CHECK-LABEL: @test_tanh_f32( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.tanh.f32(float [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_tanh_f32(global float* out, float a) |
| { |
| *out = __builtin_amdgcn_tanhf(a); |
| } |
| |
| // CHECK-LABEL: @test_tanh_f16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store ptr addrspace(1) [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load half, ptr addrspace(1) [[TMP0]], align 2 |
| // CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.amdgcn.tanh.f16(half [[TMP1]]) |
| // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store half [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_tanh_f16(global half* out, global half* a) |
| { |
| *out = __builtin_amdgcn_tanhh(*a); |
| } |
| |
| // CHECK-LABEL: @test_tanh_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.tanh.bf16(bfloat [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_tanh_bf16(global __bf16* out, __bf16 a) |
| { |
| *out = __builtin_amdgcn_tanh_bf16(a); |
| } |
| |
| // CHECK-LABEL: @test_rcp_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rcp.bf16(bfloat [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_rcp_bf16(global __bf16* out, __bf16 a) |
| { |
| *out = __builtin_amdgcn_rcp_bf16(a); |
| } |
| |
| // CHECK-LABEL: @test_sqrt_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_sqrt_bf16(global __bf16* out, __bf16 a) |
| { |
| *out = __builtin_amdgcn_sqrt_bf16(a); |
| } |
| |
| // CHECK-LABEL: @test_rsq_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_rsq_bf16(global __bf16* out, __bf16 a) |
| { |
| *out = __builtin_amdgcn_rsq_bf16(a); |
| } |
| |
| // CHECK-LABEL: @test_log_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.log.bf16(bfloat [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_log_bf16(global __bf16* out, __bf16 a) |
| { |
| *out = __builtin_amdgcn_log_bf16(a); |
| } |
| |
| // CHECK-LABEL: @test_exp2_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.exp2.bf16(bfloat [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_exp2_bf16(global __bf16* out, __bf16 a) |
| { |
| *out = __builtin_amdgcn_exp2_bf16(a); |
| } |
| |
| // CHECK-LABEL: @test_sin_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.sin.bf16(bfloat [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_sin_bf16(global __bf16* out, __bf16 a) |
| { |
| *out = __builtin_amdgcn_sin_bf16(a); |
| } |
| |
| // CHECK-LABEL: @test_cos_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.cos.bf16(bfloat [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_cos_bf16(global __bf16* out, __bf16 a) |
| { |
| *out = __builtin_amdgcn_cos_bf16(a); |
| } |
| |
| // CHECK-LABEL: @test_cvt_f16_fp8( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP0]], i32 0) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP2]], i64 0 |
| // CHECK-NEXT: store half [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 2 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP3]], i32 1) |
| // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP5]], i64 1 |
| // CHECK-NEXT: store half [[TMP4]], ptr addrspace(1) [[ARRAYIDX1]], align 2 |
| // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP6]], i32 2) |
| // CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP8]], i64 2 |
| // CHECK-NEXT: store half [[TMP7]], ptr addrspace(1) [[ARRAYIDX2]], align 2 |
| // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP9]], i32 3) |
| // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP11]], i64 3 |
| // CHECK-NEXT: store half [[TMP10]], ptr addrspace(1) [[ARRAYIDX3]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_f16_fp8(global half* out, int a) |
| { |
| out[0] = __builtin_amdgcn_cvt_f16_fp8(a, 0); |
| out[1] = __builtin_amdgcn_cvt_f16_fp8(a, 1); |
| out[2] = __builtin_amdgcn_cvt_f16_fp8(a, 2); |
| out[3] = __builtin_amdgcn_cvt_f16_fp8(a, 3); |
| } |
| |
| // CHECK-LABEL: @test_cvt_f16_bf8( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP0]], i32 0) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP2]], i64 0 |
| // CHECK-NEXT: store half [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 2 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP3]], i32 1) |
| // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP5]], i64 1 |
| // CHECK-NEXT: store half [[TMP4]], ptr addrspace(1) [[ARRAYIDX1]], align 2 |
| // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP6]], i32 2) |
| // CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP8]], i64 2 |
| // CHECK-NEXT: store half [[TMP7]], ptr addrspace(1) [[ARRAYIDX2]], align 2 |
| // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = call half @llvm.amdgcn.cvt.f16.bf8(i32 [[TMP9]], i32 3) |
| // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP11]], i64 3 |
| // CHECK-NEXT: store half [[TMP10]], ptr addrspace(1) [[ARRAYIDX3]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_f16_bf8(global half* out, int a) |
| { |
| out[0] = __builtin_amdgcn_cvt_f16_bf8(a, 0); |
| out[1] = __builtin_amdgcn_cvt_f16_bf8(a, 1); |
| out[2] = __builtin_amdgcn_cvt_f16_bf8(a, 2); |
| out[3] = __builtin_amdgcn_cvt_f16_bf8(a, 3); |
| } |
| |
| // CHECK-LABEL: @test_cvt_pk_f16_fp8( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16 [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[TMP2]], i64 0 |
| // CHECK-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_pk_f16_fp8(global half2* out, short a) |
| { |
| out[0] = __builtin_amdgcn_cvt_pk_f16_fp8(a); |
| } |
| |
| // CHECK-LABEL: @test_cvt_pk_f16_bf8( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i16 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR_ASCAST]], align 2 |
| // CHECK-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16 [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[TMP2]], i64 0 |
| // CHECK-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_pk_f16_bf8(global half2* out, short a) |
| { |
| out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a); |
| } |
| |
| // CHECK-LABEL: @test_sat_pk4_i4_i8( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr |
| // CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 [[TMP0]]) |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP2]], align 2 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 [[TMP3]]) |
| // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i16 [[TMP4]], ptr [[TMP5]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_sat_pk4_i4_i8(ushort *out, uint src) |
| { |
| *out = __builtin_amdgcn_sat_pk4_i4_i8(src); |
| *out = __builtin_amdgcn_sat_pk4_u4_u8(src); |
| } |
| |
| // CHECK-LABEL: @test_permlane16_swap( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr |
| // CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false) |
| // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 |
| // CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 |
| // CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 |
| // CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false) |
| // CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0 |
| // CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1 |
| // CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 |
| // CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 |
| // CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 |
| // CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true) |
| // CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0 |
| // CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1 |
| // CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 |
| // CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 |
| // CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8 |
| // CHECK-NEXT: ret void |
| // |
| void test_permlane16_swap(global uint2* out, uint old, uint src) { |
| *out = __builtin_amdgcn_permlane16_swap(old, src, false, false); |
| *out = __builtin_amdgcn_permlane16_swap(old, src, true, false); |
| *out = __builtin_amdgcn_permlane16_swap(old, src, false, true); |
| } |
| |
| // CHECK-LABEL: @test_cvt_f32_fp8_e5m3( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP0]], i32 0) |
| // CHECK-NEXT: [[CONV:%.*]] = fptosi float [[TMP1]] to i32 |
| // CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP2]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP3]], i32 1) |
| // CHECK-NEXT: [[CONV1:%.*]] = fptosi float [[TMP4]] to i32 |
| // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[CONV1]], ptr addrspace(1) [[TMP5]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP6]], i32 2) |
| // CHECK-NEXT: [[CONV2:%.*]] = fptosi float [[TMP7]] to i32 |
| // CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[CONV2]], ptr addrspace(1) [[TMP8]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP9]], i32 3) |
| // CHECK-NEXT: [[CONV3:%.*]] = fptosi float [[TMP10]] to i32 |
| // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[CONV3]], ptr addrspace(1) [[TMP11]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_f32_fp8_e5m3(global int* out, int a) |
| { |
| *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 0); |
| *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 1); |
| *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 2); |
| *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 3); |
| } |