blob: 4158b4633c3d95b79f3f81b5347d29aef19610e2 [file] [edit]
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx950 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
#define __device__ __attribute__((device))
typedef _Float16 v2h __attribute__((ext_vector_type(2)));
typedef _Float16 v32h __attribute__((ext_vector_type(32)));
typedef unsigned int v6ui __attribute__((ext_vector_type(6)));
typedef short v2s __attribute__((ext_vector_type(2)));
// CHECK-LABEL: define dso_local void @_Z34test_cvt_scalef32_pk32_fp6_f16_hipPDv6_jDv32_DF16_f(
// CHECK-SAME: ptr noundef [[OUT:%.*]], <32 x half> noundef [[A:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <32 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 64
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR_ASCAST]], align 64
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> [[TMP0]], float [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP2]], ptr [[TMP3]], align 32
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_pk32_fp6_f16_hip(v6ui* out, v32h a, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(a, scale);
}
// CHECK-LABEL: define dso_local void @_Z34test_cvt_scalef32_pk32_bf6_f16_hipPDv6_jDv32_DF16_f(
// CHECK-SAME: ptr noundef [[OUT:%.*]], <32 x half> noundef [[A:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <32 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 64
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR_ASCAST]], align 64
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> [[TMP0]], float [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP2]], ptr [[TMP3]], align 32
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_pk32_bf6_f16_hip(v6ui* out, v32h a, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(a, scale);
}
// CHECK-LABEL: define dso_local void @_Z29test_cvt_scalef32_f16_fp8_hipPDv2_DF16_S_if(
// CHECK-SAME: ptr noundef [[OUT:%.*]], <2 x half> noundef [[SRC:%.*]], i32 noundef [[SRC32:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SRC32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[SRC32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC32_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x half> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC32]], ptr [[SRC32_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC32_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP0]], i32 [[TMP1]], float [[TMP2]], i32 0, i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x half> [[TMP3]], ptr [[TMP4]], align 4
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_f16_fp8_hip(v2h* out, v2h src, int src32, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_f16_fp8(src, src32, scale, 0, false);
}
// CHECK-LABEL: define dso_local void @_Z29test_cvt_scalef32_f16_bf8_hipPDv2_DF16_S_if(
// CHECK-SAME: ptr noundef [[OUT:%.*]], <2 x half> noundef [[SRC:%.*]], i32 noundef [[SRC32:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SRC32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[SRC32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC32_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x half> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC32]], ptr [[SRC32_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC32_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP0]], i32 [[TMP1]], float [[TMP2]], i32 0, i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x half> [[TMP3]], ptr [[TMP4]], align 4
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_f16_bf8_hip(v2h* out, v2h src, int src32, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_f16_bf8(src, src32, scale, 0, false);
}
// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_fp8_f16_hipPDv2_sS_Dv2_DF16_f(
// CHECK-SAME: ptr noundef [[OUT:%.*]], <2 x i16> noundef [[SRC:%.*]], <2 x half> noundef [[A:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i16> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store <2 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP0]], <2 x half> [[TMP1]], float [[TMP2]], i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP3]], ptr [[TMP4]], align 4
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_pk_fp8_f16_hip(v2s* out, v2s src, v2h a, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(src, a, scale, false);
}
// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_bf8_f16_hipPDv2_sS_Dv2_DF16_f(
// CHECK-SAME: ptr noundef [[OUT:%.*]], <2 x i16> noundef [[SRC:%.*]], <2 x half> noundef [[A:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i16> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store <2 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP0]], <2 x half> [[TMP1]], float [[TMP2]], i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP3]], ptr [[TMP4]], align 4
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_pk_bf8_f16_hip(v2s* out, v2s src, v2h a, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(src, a, scale, false);
}
// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_f16_fp4_hipPDv2_DF16_jf(
// CHECK-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[SRC:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP0]], float [[TMP1]], i32 0)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x half> [[TMP2]], ptr [[TMP3]], align 4
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_pk_f16_fp4_hip(v2h* out, unsigned int src, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 0);
}
// CHECK-LABEL: define dso_local void @_Z34test_cvt_scalef32_pk32_f16_fp6_hipPDv32_DF16_Dv6_jf(
// CHECK-SAME: ptr noundef [[OUT:%.*]], <6 x i32> noundef [[SRC:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <6 x i32> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 32
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr [[SRC_ADDR_ASCAST]], align 32
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call contract <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <32 x half> [[TMP2]], ptr [[TMP3]], align 64
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_pk32_f16_fp6_hip(v32h* out, v6ui src, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(src, scale);
}
// CHECK-LABEL: define dso_local void @_Z34test_cvt_scalef32_pk32_f16_bf6_hipPDv32_DF16_Dv6_jf(
// CHECK-SAME: ptr noundef [[OUT:%.*]], <6 x i32> noundef [[SRC:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <6 x i32> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 32
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr [[SRC_ADDR_ASCAST]], align 32
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call contract <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> [[TMP0]], float [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <32 x half> [[TMP2]], ptr [[TMP3]], align 64
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_pk32_f16_bf6_hip(v32h* out, v6ui src, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(src, scale);
}
// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_f16_fp8_hipPDv2_DF16_jf(
// CHECK-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[SRC:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP0]], float [[TMP1]], i1 false)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x half> [[TMP2]], ptr [[TMP3]], align 4
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_pk_f16_fp8_hip(v2h* out, unsigned int src, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(src, scale, false);
}
// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_f16_bf8_hipPDv2_DF16_jf(
// CHECK-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[SRC:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP0]], float [[TMP1]], i1 false)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x half> [[TMP2]], ptr [[TMP3]], align 4
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_pk_f16_bf8_hip(v2h* out, unsigned int src, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(src, scale, false);
}
// CHECK-LABEL: define dso_local void @_Z32test_cvt_scalef32_pk_fp4_f16_hipPjjDv2_DF16_f(
// CHECK-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[SRC:%.*]], <2 x half> noundef [[A:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store <2 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP0]], <2 x half> [[TMP1]], float [[TMP2]], i32 0)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_pk_fp4_f16_hip(unsigned int* out, unsigned int src, v2h a, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(src, a, scale, 0);
}
// CHECK-LABEL: define dso_local void @_Z35test_cvt_scalef32_sr_pk_fp4_f16_hipPjjDv2_DF16_jf(
// CHECK-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[SRC:%.*]], <2 x half> noundef [[A:%.*]], i32 noundef [[SEED:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store <2 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SEED]], ptr [[SEED_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP0]], <2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_sr_pk_fp4_f16_hip(unsigned int* out, unsigned int src, v2h a, unsigned int seed, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(src, a, seed, scale, 0);
}
// CHECK-LABEL: define dso_local void @_Z37test_cvt_scalef32_sr_pk32_bf6_f16_hipPDv6_jDv32_DF16_jf(
// CHECK-SAME: ptr noundef [[OUT:%.*]], <32 x half> noundef [[A:%.*]], i32 noundef [[SEED:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <32 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 64
// CHECK-NEXT: store i32 [[SEED]], ptr [[SEED_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR_ASCAST]], align 64
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> [[TMP0]], i32 [[TMP1]], float [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP3]], ptr [[TMP4]], align 32
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_sr_pk32_bf6_f16_hip(v6ui* out, v32h a, unsigned int seed, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(a, seed, scale);
}
// CHECK-LABEL: define dso_local void @_Z37test_cvt_scalef32_sr_pk32_fp6_f16_hipPDv6_jDv32_DF16_jf(
// CHECK-SAME: ptr noundef [[OUT:%.*]], <32 x half> noundef [[A:%.*]], i32 noundef [[SEED:%.*]], float noundef [[SCALE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <32 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 64
// CHECK-NEXT: store i32 [[SEED]], ptr [[SEED_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[SCALE]], ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR_ASCAST]], align 64
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> [[TMP0]], i32 [[TMP1]], float [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP3]], ptr [[TMP4]], align 32
// CHECK-NEXT: ret void
//
__device__ void test_cvt_scalef32_sr_pk32_fp6_f16_hip(v6ui* out, v32h a, unsigned int seed, float scale) {
*out = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(a, seed, scale);
}
// CHECK-LABEL: define dso_local void @_Z23test_cvt_sr_f16_f32_hipPDv2_DF16_S_fj(
// CHECK-SAME: ptr noundef [[OUT:%.*]], <2 x half> noundef [[SRC:%.*]], float noundef [[A:%.*]], i32 noundef [[SEED:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[SEED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SEED_ADDR]] to ptr
// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x half> [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SEED]], ptr [[SEED_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SEED_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP0]], float [[TMP1]], i32 [[TMP2]], i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x half> [[TMP3]], ptr [[TMP4]], align 4
// CHECK-NEXT: ret void
//
__device__ void test_cvt_sr_f16_f32_hip(v2h* out, v2h src, float a, unsigned int seed) {
*out = __builtin_amdgcn_cvt_sr_f16_f32(src, a, seed, false);
}