// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -cl-std=CL1.2 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s
// REQUIRES: amdgpu-registered-target

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

typedef unsigned int uint;
typedef unsigned short ushort;
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
typedef half __attribute__((ext_vector_type(32))) half32;
typedef short __attribute__((ext_vector_type(2))) short2;
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
typedef float __attribute__((ext_vector_type(16))) float16;
typedef half __attribute__((ext_vector_type(2))) half2;
typedef float __attribute__((ext_vector_type(2))) float2;
typedef float __attribute__((ext_vector_type(32))) float32;

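// GFX950 exposes v_prng_b32; the builtin takes a 32-bit state word and
// returns the next value in the pseudo-random sequence.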
// CHECK-LABEL: @test_prng_b32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
// CHECK-NEXT: ret void
//
void test_prng_b32(global uint* out, uint a) {
  *out = __builtin_amdgcn_prng_b32(a);
}

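// permlane16_swap returns two results (a { i32, i32 } in IR) that Clang
// repacks into the uint2 destination; the two trailing bools map to the
// intrinsic's fi and bound_ctrl operands, exercised below in turn.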
// CHECK-LABEL: @test_permlane16_swap(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
// CHECK-NEXT: ret void
//
void test_permlane16_swap(global uint2* out, uint old, uint src) {
  *out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
  *out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
  *out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
}

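// Same checks for the 32-lane variant.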
// CHECK-LABEL: @test_permlane32_swap(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
// CHECK-NEXT: ret void
//
void test_permlane32_swap(global uint2* out, uint old, uint src) {
  *out = __builtin_amdgcn_permlane32_swap(old, src, false, false);
  *out = __builtin_amdgcn_permlane32_swap(old, src, true, false);
  *out = __builtin_amdgcn_permlane32_swap(old, src, false, true);
}

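// 32 bf16/f16 inputs (or two 16-wide f32 vectors) convert, scaled by 'scale',
// to 32 six-bit FP6/BF6 values, i.e. 192 bits, which is why the results land
// in a uint6.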
// CHECK-LABEL: @test_cvt_scalef32_pk(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5)
// CHECK-NEXT: [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
// CHECK-NEXT: [[SRC0F32_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
// CHECK-NEXT: [[SRC1F32_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64
// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64
// CHECK-NEXT: store <16 x float> [[SRC0F32:%.*]], ptr addrspace(5) [[SRC0F32_ADDR]], align 64
// CHECK-NEXT: store <16 x float> [[SRC1F32:%.*]], ptr addrspace(5) [[SRC1F32_ADDR]], align 64
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], float [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 32
// CHECK-NEXT: [[TMP4:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> [[TMP4]], float [[TMP5]])
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 32
// CHECK-NEXT: [[TMP8:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> [[TMP8]], float [[TMP9]])
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 32
// CHECK-NEXT: [[TMP12:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> [[TMP12]], float [[TMP13]])
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 32
// CHECK-NEXT: [[TMP16:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64
// CHECK-NEXT: [[TMP17:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> [[TMP16]], <16 x float> [[TMP17]], float [[TMP18]])
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP19]], ptr addrspace(1) [[TMP20]], align 32
// CHECK-NEXT: [[TMP21:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64
// CHECK-NEXT: [[TMP22:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64
// CHECK-NEXT: [[TMP23:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> [[TMP21]], <16 x float> [[TMP22]], float [[TMP23]])
// CHECK-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP24]], ptr addrspace(1) [[TMP25]], align 32
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float16 src0f32, float16 src1f32, float scale)
{
  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(srcbf32, scale);
  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(srch32, scale);
  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(srcbf32, scale);
  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(srch32, scale);
  *out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(src0f32, src1f32, scale);
  *out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(src0f32, src1f32, scale);
}

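// Both ashr_pk variants shift src0 and src1 right by src2 and pack the two
// 8-bit results into an i16; the zext to i32 in the IR comes from storing
// through the int pointer.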
// CHECK-LABEL: @test_ashr_pk_i8_i32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) {
  *out = __builtin_amdgcn_ashr_pk_i8_i32(src0, src1, src2);
}

// CHECK-LABEL: @test_ashr_pk_u8_i32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.u8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
  *out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2);
}

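// The short2 arguments are reinterpreted as <2 x bfloat> before calling
// llvm.amdgcn.fdot2.f32.bf16, as the bitcasts below show.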
// CHECK-LABEL: @builtins_amdgcn_dl_insts(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: store <2 x i16> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: store <2 x i16> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[TMP0]] to <2 x bfloat>
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to <2 x bfloat>
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[TMP1]], <2 x bfloat> [[TMP3]], float [[TMP4]], i1 false)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: ret void
//
void builtins_amdgcn_dl_insts(global float *out, float fC, short2 v2ssA, short2 v2ssB) {
  *out = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
}

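// The dot2c variant takes native bfloat2 operands, so no bitcasts are needed
// here.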
// CHECK-LABEL: @builtins_amdgcn_dl_dot2c(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: store <2 x bfloat> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: store <2 x bfloat> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.fdot2c.f32.bf16(<2 x bfloat> [[TMP0]], <2 x bfloat> [[TMP1]], float [[TMP2]], i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void builtins_amdgcn_dl_dot2c(global float *out, float fC, bfloat2 v2ssA, bfloat2 v2ssB) {
  *out = __builtin_amdgcn_fdot2c_f32_bf16(v2ssA, v2ssB, fC, false);
}

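// The scalef32 fp8/bf8-to-f16/f32 tests below cycle the byte-select immediate
// through 0..3. The f16 variants also pass *out through as the first operand
// plus a word-select bool, since each call writes only one half of the
// <2 x half> result.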
// CHECK-LABEL: @test_cvt_scalef32_f16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0, i1 false)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP7]], i32 [[TMP8]], float [[TMP9]], i32 1, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP13:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP12]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP13]], i32 [[TMP14]], float [[TMP15]], i32 2, i1 false)
// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP19:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP18]], align 4
// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP22:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP19]], i32 [[TMP20]], float [[TMP21]], i32 3, i1 false)
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP25:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP24]], align 4
// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP28:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP25]], i32 [[TMP26]], float [[TMP27]], i32 0, i1 true)
// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP28]], ptr addrspace(1) [[TMP29]], align 4
// CHECK-NEXT: [[TMP30:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP31:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP30]], align 4
// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP34:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP31]], i32 [[TMP32]], float [[TMP33]], i32 1, i1 true)
// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
// CHECK-NEXT: [[TMP36:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP37:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP36]], align 4
// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP39:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP40:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP37]], i32 [[TMP38]], float [[TMP39]], i32 2, i1 true)
// CHECK-NEXT: [[TMP41:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP40]], ptr addrspace(1) [[TMP41]], align 4
// CHECK-NEXT: [[TMP42:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP43:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP42]], align 4
// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP46:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP43]], i32 [[TMP44]], float [[TMP45]], i32 3, i1 true)
// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_f16_fp8(global half2* out, uint src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 0, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 1, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 2, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 3, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 0, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 1, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 2, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 3, true);
}

// CHECK-LABEL: @test_cvt_scalef32_f32_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP0]], float [[TMP1]], i32 0)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP4]], float [[TMP5]], i32 1)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP8]], float [[TMP9]], i32 2)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP12]], float [[TMP13]], i32 3)
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP14]], ptr addrspace(1) [[TMP15]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_f32_fp8(global float* out, uint src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 3);
}

// CHECK-LABEL: @test_cvt_scalef32_f16_bf8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0, i1 false)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP7]], i32 [[TMP8]], float [[TMP9]], i32 1, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP13:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP12]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP13]], i32 [[TMP14]], float [[TMP15]], i32 2, i1 false)
// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP19:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP18]], align 4
// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP22:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP19]], i32 [[TMP20]], float [[TMP21]], i32 3, i1 false)
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP25:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP24]], align 4
// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP28:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP25]], i32 [[TMP26]], float [[TMP27]], i32 0, i1 true)
// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP28]], ptr addrspace(1) [[TMP29]], align 4
// CHECK-NEXT: [[TMP30:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP31:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP30]], align 4
// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP34:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP31]], i32 [[TMP32]], float [[TMP33]], i32 1, i1 true)
// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
// CHECK-NEXT: [[TMP36:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP37:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP36]], align 4
// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP39:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP40:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP37]], i32 [[TMP38]], float [[TMP39]], i32 2, i1 true)
// CHECK-NEXT: [[TMP41:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP40]], ptr addrspace(1) [[TMP41]], align 4
// CHECK-NEXT: [[TMP42:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP43:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP42]], align 4
// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP46:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP43]], i32 [[TMP44]], float [[TMP45]], i32 3, i1 true)
// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_f16_bf8(global half2* out, uint src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 0, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 1, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 2, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 3, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 0, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 1, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 2, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 3, true);
}

// CHECK-LABEL: @test_cvt_scalef32_f32_bf8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP0]], float [[TMP1]], i32 0)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP4]], float [[TMP5]], i32 1)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP8]], float [[TMP9]], i32 2)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP12]], float [[TMP13]], i32 3)
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP14]], ptr addrspace(1) [[TMP15]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_f32_bf8(global float* out, uint src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 3);
}

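// Packing direction: two scaled f32 values become one fp8/bf8 byte pair in
// the short2 destination. *out is threaded through, and the trailing bool
// selects which 16-bit word of the packed result is written.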
// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i1 true)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i1 false)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_fp8_f32(global short2* out, float src0, float src1, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out, src0, src1, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out, src0, src1, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i1 true)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i1 false)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_bf8_f32(global short2* out, float src0, float src1, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out, src0, src1, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out, src0, src1, scale, false);
}

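// Unpacking direction: one word of the packed fp8/bf8 source (chosen by the
// bool) expands to a scaled float2.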
// CHECK-LABEL: @test_cvt_scalef32_pk_f32_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_f32_fp8(global float2* out, unsigned int src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_f32_bf8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_f32_bf8(global float2* out, unsigned int src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, false);
}

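// The remaining fp8/bf8 pack tests use half2 and bfloat2 sources; the IR
// shape matches the f32 cases above.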
// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_fp8_f16(global short2* out, half2 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(*out, src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(*out, src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_fp8_bf16(global short2* out, bfloat2 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(*out, src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(*out, src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_bf8_f16(global short2* out, half2 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(*out, src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(*out, src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_bf8_bf16(global short2* out, bfloat2 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(*out, src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(*out, src, scale, false);
}

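// FP4 packs eight four-bit values into one dword, so the 0..3 immediate
// selects which nibble pair to unpack into the float2 result.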
| // CHECK-LABEL: @test_cvt_scalef32_pk_f32_fp4( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP0]], float [[TMP1]], i32 0) |
| // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8 |
| // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP4]], float [[TMP5]], i32 1) |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP8]], float [[TMP9]], i32 2) |
| // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x float> [[TMP10]], ptr addrspace(1) [[TMP11]], align 8 |
| // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP12]], float [[TMP13]], i32 3) |
| // CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x float> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk_f32_fp4(global float2* out, uint src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 2); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 3); |
| } |
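| |
| // Manual note: the fp4 unpack yields two floats decoded from one byte (a |
| // pair of 4-bit values) of 'src'; the final operand is a byte index, passed |
| // as an immediate i32 0..3 above and presumably required to be constant. An |
| // illustrative helper (name ours, not covered by the assertions) expanding a |
| // whole dword into eight floats with a shared scale: |
| static void example_unpack_fp4x8(global float2 *dst, uint packed, float scale) |
| { |
|   dst[0] = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(packed, scale, 0); |
|   dst[1] = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(packed, scale, 1); |
|   dst[2] = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(packed, scale, 2); |
|   dst[3] = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(packed, scale, 3); |
| } |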
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk_fp4_f32( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i32 0) |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i32 1) |
| // CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP17:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP15]], float [[TMP16]], float [[TMP17]], float [[TMP18]], i32 2) |
| // CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 |
| // CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4 |
| // CHECK-NEXT: [[TMP23:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP24:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP22]], float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 3) |
| // CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk_fp4_f32(global unsigned int* out, float src0, float src1, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 2); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 3); |
| } |
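| |
| // Manual note: the fp4 pack direction is a read-modify-write, which is why |
| // *out is loaded and fed back as the first operand above: src0 and src1 are |
| // scaled, encoded as two fp4 values, and written into the byte selected by |
| // the final immediate while the other three bytes of the dword are kept. |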
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk_f16_fp4( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP0]], float [[TMP1]], i32 0) |
| // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP4]], float [[TMP5]], i32 1) |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP8]], float [[TMP9]], i32 2) |
| // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP12]], float [[TMP13]], i32 3) |
| // CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[TMP14]], ptr addrspace(1) [[TMP15]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk_f16_fp4(global half2* out, uint src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 2); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 3); |
| } |
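| |
| // Manual note: the f16 unpack above and the bf16 unpack below mirror the f32 |
| // one, differing only in the element type of the decoded pair. |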
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk_bf16_fp4( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP0]], float [[TMP1]], i32 0) |
| // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP4]], float [[TMP5]], i32 1) |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP8]], float [[TMP9]], i32 2) |
| // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP12]], float [[TMP13]], i32 3) |
| // CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[TMP14]], ptr addrspace(1) [[TMP15]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk_bf16_fp4(global bfloat2* out, uint src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 2); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 3); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk32_f32_fpbf6( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 |
| // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> [[TMP0]], float [[TMP1]]) |
| // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <32 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 128 |
| // CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 |
| // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> [[TMP4]], float [[TMP5]]) |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <32 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 128 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk32_f32_fpbf6(global float32* out, uint6 src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(src, scale); |
| *out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(src, scale); |
| } |
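| |
| // Manual note: the pk32 decoders consume 32 six-bit values, 32 * 6 = 192 |
| // bits, exactly the <6 x i32> (uint6) source, with one scale applied to all |
| // 32 results; fp6 and bf6 share the packing and differ only in encoding. |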
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk32_f16_fpbf6( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 |
| // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> [[TMP0]], float [[TMP1]]) |
| // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <32 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64 |
| // CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 |
| // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> [[TMP4]], float [[TMP5]]) |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <32 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk32_f16_fpbf6(global half32 *out, uint6 src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(src, scale); |
| *out = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(src, scale); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk32_bf16_fpbf6( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 |
| // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> [[TMP0]], float [[TMP1]]) |
| // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <32 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64 |
| // CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32 |
| // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> [[TMP4]], float [[TMP5]]) |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <32 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk32_bf16_fpbf6(global bfloat32 *out, uint6 src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(src, scale); |
| *out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(src, scale); |
| } |
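| |
| // Manual note: the three pk32 decode targets above (float32, half32, |
| // bfloat32) share the 6-dword source layout; only the result type changes. |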
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk_f16_fp8( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true) |
| // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false) |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk_f16_fp8(global half2* out, unsigned int src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(src, scale, true); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(src, scale, false); |
| } |
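| |
| // Manual note: for the fp8/bf8 decoders, 'src' carries four 8-bit values and |
| // the trailing bool appears to pick which 16-bit pair is converted. The next |
| // three tests fill out the {f16,bf16} x {fp8,bf8} matrix with the same shape. |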
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk_f16_bf8( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true) |
| // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false) |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk_f16_bf8(global half2* out, unsigned int src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(src, scale, true); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(src, scale, false); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk_bf16_fp8( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true) |
| // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false) |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk_bf16_fp8(global bfloat2* out, unsigned int src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(src, scale, true); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(src, scale, false); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk_bf16_bf8( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true) |
| // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false) |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk_bf16_bf8(global bfloat2* out, unsigned int src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(src, scale, true); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(src, scale, false); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk_fp4_f16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i32 0) |
| // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i32 1) |
| // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP13]], <2 x half> [[TMP14]], float [[TMP15]], i32 2) |
| // CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4 |
| // CHECK-NEXT: [[TMP20:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP19]], <2 x half> [[TMP20]], float [[TMP21]], i32 3) |
| // CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk_fp4_f16(global unsigned int* out, half2 src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 2); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 3); |
| } |
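| |
| // Manual note: this and the bf16 encoder below follow the same byte-select |
| // read-modify-write contract as the f32 fp4 encoder earlier in the file. |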
| |
| // CHECK-LABEL: @test_cvt_scalef32_pk_fp4_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i32 0) |
| // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i32 1) |
| // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP13]], <2 x bfloat> [[TMP14]], float [[TMP15]], i32 2) |
| // CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4 |
| // CHECK-NEXT: [[TMP20:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP19]], <2 x bfloat> [[TMP20]], float [[TMP21]], i32 3) |
| // CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_pk_fp4_bf16(global unsigned int* out, bfloat2 src, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 2); |
| *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 3); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_f16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) |
| // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP8]], <2 x half> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) |
| // CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP15]], <2 x half> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) |
| // CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 |
| // CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4 |
| // CHECK-NEXT: [[TMP23:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP22]], <2 x half> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3) |
| // CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_sr_pk_fp4_f16(global unsigned *out, half2 src, uint seed, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 2); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 3); |
| } |
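| |
| // Manual note: the sr_* builtins add an i32 'seed' operand, which we read as |
| // the stochastic-rounding entropy source; the fp4 encode shape is otherwise |
| // unchanged. A sketch (helper name ours, unchecked) that remixes the seed per |
| // call so the two packed pairs do not round identically: |
| static uint example_sr_pack_fp4(uint old, half2 src, uint seed, float scale) |
| { |
|   old = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(old, src, seed, scale, 0); |
|   seed ^= 0x9e3779b9u; // arbitrary remix constant, our choice |
|   old = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(old, src, seed, scale, 1); |
|   return old; |
| } |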
| |
| // CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) |
| // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP8]], <2 x bfloat> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) |
| // CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP15]], <2 x bfloat> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) |
| // CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 |
| // CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4 |
| // CHECK-NEXT: [[TMP23:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP22]], <2 x bfloat> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3) |
| // CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_sr_pk_fp4_bf16(global unsigned *out, bfloat2 src, uint seed, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 2); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 3); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_f32( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x float>, align 8, addrspace(5) |
| // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x float> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP1]], <2 x float> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP8]], <2 x float> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) |
| // CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP15]], <2 x float> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) |
| // CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 |
| // CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4 |
| // CHECK-NEXT: [[TMP23:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP22]], <2 x float> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3) |
| // CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_sr_pk_fp4_f32(global unsigned *out, float2 src, uint seed, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 2); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 3); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_sr_pk32( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5) |
| // CHECK-NEXT: [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5) |
| // CHECK-NEXT: [[SRCF32_ADDR:%.*]] = alloca <32 x float>, align 128, addrspace(5) |
| // CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8 |
| // CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64 |
| // CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64 |
| // CHECK-NEXT: store <32 x float> [[SRCF32:%.*]], ptr addrspace(5) [[SRCF32_ADDR]], align 128 |
| // CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], i32 [[TMP1]], float [[TMP2]]) |
| // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 |
| // CHECK-NEXT: store <6 x i32> [[TMP3]], ptr addrspace(1) [[TMP4]], align 32 |
| // CHECK-NEXT: [[TMP5:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64 |
| // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> [[TMP5]], i32 [[TMP6]], float [[TMP7]]) |
| // CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 |
| // CHECK-NEXT: store <6 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 32 |
| // CHECK-NEXT: [[TMP10:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128 |
| // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP13:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> [[TMP10]], i32 [[TMP11]], float [[TMP12]]) |
| // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 |
| // CHECK-NEXT: store <6 x i32> [[TMP13]], ptr addrspace(1) [[TMP14]], align 32 |
| // CHECK-NEXT: [[TMP15:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64 |
| // CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP17:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> [[TMP15]], i32 [[TMP16]], float [[TMP17]]) |
| // CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 |
| // CHECK-NEXT: store <6 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 32 |
| // CHECK-NEXT: [[TMP20:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64 |
| // CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP22:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP23:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> [[TMP20]], i32 [[TMP21]], float [[TMP22]]) |
| // CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 |
| // CHECK-NEXT: store <6 x i32> [[TMP23]], ptr addrspace(1) [[TMP24]], align 32 |
| // CHECK-NEXT: [[TMP25:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128 |
| // CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP28:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> [[TMP25]], i32 [[TMP26]], float [[TMP27]]) |
| // CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 |
| // CHECK-NEXT: store <6 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 32 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_sr_pk32(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float32 srcf32, unsigned src1, float src2) |
| { |
| *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16(srcbf32, src1, src2); |
| *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(srch32, src1, src2); |
| *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(srcf32, src1, src2); |
| *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16(srcbf32, src1, src2); |
| *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(srch32, src1, src2); |
| *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(srcf32, src1, src2); |
| } |
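| |
| // Manual note: by analogy with the neighboring sr_* tests, 'src1' is the |
| // stochastic-rounding seed and 'src2' the scale; each call above packs a full |
| // 32-element source into the 6-dword (192-bit) form in one step. |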
| |
| // CHECK-LABEL: @test_cvt_scalef32_sr_bf8_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) |
| // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store bfloat [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP1]], bfloat [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP8]], bfloat [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) |
| // CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP15]], bfloat [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) |
| // CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_sr_bf8_bf16(global unsigned *out, __bf16 src, uint seed, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(*out, src, seed, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(*out, src, seed, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(*out, src, seed, scale, 2); |
| } |
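| |
| // Manual note: the scalar sr_bf8 forms convert a single value into one byte |
| // of the dword, selected by the final immediate; the test above exercises |
| // byte indices 0 through 2 only. |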
| |
| // CHECK-LABEL: @test_cvt_scalef32_sr_bf8_f16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca half, align 2, addrspace(5) |
| // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store half [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP1]], half [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP8]], half [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) |
| // CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP15]], half [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) |
| // CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_sr_bf8_f16(global unsigned *out, half src, uint seed, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f16(*out, src, seed, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f16(*out, src, seed, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f16(*out, src, seed, scale, 2); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_sr_bf8_f32( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP1]], float [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP8]], float [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) |
| // CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP15]], float [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) |
| // CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_sr_bf8_f32(global unsigned *out, float src, uint seed, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(*out, src, seed, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(*out, src, seed, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(*out, src, seed, scale, 2); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_sr_fp8_bf16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) |
| // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store bfloat [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP1]], bfloat [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP8]], bfloat [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) |
| // CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP15]], bfloat [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) |
| // CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_sr_fp8_bf16(global unsigned *out, __bf16 src, uint seed, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16(*out, src, seed, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16(*out, src, seed, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16(*out, src, seed, scale, 2); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_sr_fp8_f16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca half, align 2, addrspace(5) |
| // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store half [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP1]], half [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP8]], half [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) |
| // CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP15]], half [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) |
| // CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_sr_fp8_f16(global unsigned *out, half src, uint seed, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 2); |
| } |
| |
| // CHECK-LABEL: @test_cvt_scalef32_sr_fp8_f32( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP1]], float [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0) |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP8]], float [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1) |
| // CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4 |
| // CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4 |
| // CHECK-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP15]], float [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2) |
| // CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_scalef32_sr_fp8_f32(global unsigned *out, float src, uint seed, float scale) |
| { |
| *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(*out, src, seed, scale, 0); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(*out, src, seed, scale, 1); |
| *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(*out, src, seed, scale, 2); |
| } |
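| |
| // A hedged usage sketch, not part of the autogenerated checks: assuming the |
| // byte-select immediate spans the full dword (0-3), four stochastically |
| // rounded FP8 values can be packed into a single uint by cycling it. The |
| // function name is illustrative. |
| void example_pack_four_fp8(global uint* out, float src, uint seed, float scale) |
| { |
| uint packed = 0u; |
| packed = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(packed, src, seed, scale, 0); |
| packed = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(packed, src, seed, scale, 1); |
| packed = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(packed, src, seed, scale, 2); |
| packed = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(packed, src, seed, scale, 3); |
| *out = packed; |
| } |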
| |
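| // __builtin_amdgcn_bitop3_{b32,b16} compute an arbitrary three-input bitwise |
| // function from an 8-bit truth-table immediate: by the usual LUT convention, |
| // bit i of the table gives the per-bit result for the input combination |
| // i = (a<<2)|(b<<1)|c, so the table value 1 used below computes ~a & ~b & ~c. |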
| // CHECK-LABEL: @test_bitop3_b32( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4 |
| // CHECK-NEXT: store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4 |
| // CHECK-NEXT: store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.bitop3.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 1) |
| // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_bitop3_b32(global uint* out, uint a, uint b, uint c) |
| { |
| *out = __builtin_amdgcn_bitop3_b32(a, b, c, 1); |
| } |
| |
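| // A hedged sketch, not part of the generated checks: under the truth-table |
| // convention sketched above, the immediate 0x96 (bits 1, 2, 4 and 7 set) |
| // selects odd parity, i.e. a three-way XOR. |
| void example_bitop3_xor3(global uint* out, uint a, uint b, uint c) |
| { |
| *out = __builtin_amdgcn_bitop3_b32(a, b, c, 0x96); // a ^ b ^ c |
| } |
| |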
| // CHECK-LABEL: @test_bitop3_b16( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5) |
| // CHECK-NEXT: [[B_ADDR:%.*]] = alloca i16, align 2, addrspace(5) |
| // CHECK-NEXT: [[C_ADDR:%.*]] = alloca i16, align 2, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i16 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2 |
| // CHECK-NEXT: store i16 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 2 |
| // CHECK-NEXT: store i16 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr addrspace(5) [[A_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr addrspace(5) [[B_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[C_ADDR]], align 2 |
| // CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.bitop3.i16(i16 [[TMP0]], i16 [[TMP1]], i16 [[TMP2]], i32 1) |
| // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[TMP4]], align 2 |
| // CHECK-NEXT: ret void |
| // |
| void test_bitop3_b16(global ushort* out, ushort a, ushort b, ushort c) |
| { |
| *out = __builtin_amdgcn_bitop3_b16(a, b, c, 1); |
| } |
| |
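| // The unscaled stochastic-rounding converts follow: |
| // __builtin_amdgcn_cvt_sr_{bf16,f16}_f32 narrow an f32 to a 16-bit value, |
| // using seed to drive the rounding decision; as the i1 in the generated |
| // calls shows, the final operand picks whether the low (0) or high (1) |
| // element of the packed two-element result is replaced. |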
| // CHECK-LABEL: @test_cvt_sr_bf16_f32( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false) |
| // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true) |
| // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x bfloat> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_sr_bf16_f32(global bfloat2 *out, float src, uint seed) |
| { |
| *out = __builtin_amdgcn_cvt_sr_bf16_f32(*out, src, seed, 0); |
| *out = __builtin_amdgcn_cvt_sr_bf16_f32(*out, src, seed, 1); |
| } |
| |
| // CHECK-LABEL: @test_cvt_sr_f16_f32( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false) |
| // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true) |
| // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 |
| // CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed) |
| { |
| *out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 0); |
| *out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1); |
| } |
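| |
| // A hedged sketch, not part of the generated checks: using the word-select |
| // operand to populate both halves of a half2 in place. The function name and |
| // seed reuse are illustrative. |
| void example_cvt_sr_fill_half2(global half2* out, float lo, float hi, uint seed) |
| { |
| half2 v = (half2)(0.0h, 0.0h); |
| v = __builtin_amdgcn_cvt_sr_f16_f32(v, lo, seed, 0); |
| v = __builtin_amdgcn_cvt_sr_f16_f32(v, hi, seed, 1); |
| *out = v; |
| } |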
| |
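| // __builtin_amdgcn_load_to_lds takes (global src, local dst, size, imm |
| // offset, aux): it copies size bytes per lane straight from global memory |
| // into LDS without routing the data through VGPRs. gfx950 adds the 12-byte |
| // (96-bit) and 16-byte (128-bit) widths tested below; aux carries |
| // target-specific cache-policy bits and is left at 0 here. |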
| // CHECK-LABEL: @test_load_to_lds_96( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 |
| // CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0) |
| // CHECK-NEXT: ret void |
| // |
| void test_load_to_lds_96(global void *src, local void *dst) { |
| __builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); |
| } |
| |
| // CHECK-LABEL: @test_load_to_lds_128( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 |
| // CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0) |
| // CHECK-NEXT: ret void |
| // |
| void test_load_to_lds_128(global void *src, local void *dst) { |
| __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); |
| } |
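| |
| // A hedged usage sketch, not part of the generated checks: staging one |
| // 128-bit element per work-item into LDS and synchronizing before reading it |
| // back. The kernel, the uint4 typedef and the tile size are illustrative, |
| // and a real kernel would size the wait more precisely. |
| typedef unsigned int __attribute__((ext_vector_type(4))) uint4; |
| kernel void example_stage_tile(global uint4* in, global uint4* out) |
| { |
| local uint4 tile[64]; |
| uint lid = __builtin_amdgcn_workitem_id_x(); |
| __builtin_amdgcn_load_to_lds(&in[lid], &tile[lid], /*size=*/16, /*offset=*/0, /*aux=*/0); |
| __builtin_amdgcn_s_waitcnt(0); // conservatively wait for the LDS write to land |
| __builtin_amdgcn_s_barrier(); |
| out[lid] = tile[lid]; |
| } |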
| |
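| // __builtin_amdgcn_global_load_lds is the older, global-pointer-only form of |
| // the same staging operation; it lowers to llvm.amdgcn.global.load.lds with |
| // the identical (src, dst, size, offset, aux) operands, and the gfx950 96- |
| // and 128-bit widths are exercised below as well. |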
| // CHECK-LABEL: @test_global_load_lds_96( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 |
| // CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0) |
| // CHECK-NEXT: ret void |
| // |
| void test_global_load_lds_96(global void *src, local void *dst) { |
| __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); |
| } |
| |
| // CHECK-LABEL: @test_global_load_lds_128( |
| // CHECK-NEXT: entry: |
| // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) |
| // CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) |
| // CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 |
| // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 |
| // CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0) |
| // CHECK-NEXT: ret void |
| // |
| void test_global_load_lds_128(global void *src, local void *dst) { |
| __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); |
| } |