| // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 |
| // REQUIRES: amdgpu-registered-target |
| // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx942 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX942 |
| |
// Local definition of the `__device__` qualifier: it expands to the `device`
// attribute, which is how the test functions in this file are marked.
| #define __device__ __attribute__((device)) |
| |
// Vector aliases used as operand and result types of the builtins under test.
// Naming scheme: v<N>f is an N-element float vector, v<N>h is an N-element
// _Float16 vector (both declared with ext_vector_type).
// NOTE(review): v32f and v16h are not referenced by the tests visible in this
// section; presumably they are used further down or kept for future cases.
| typedef float v4f __attribute__((ext_vector_type(4))); |
| typedef float v16f __attribute__((ext_vector_type(16))); |
| typedef float v32f __attribute__((ext_vector_type(32))); |
| typedef _Float16 v4h __attribute__((ext_vector_type(4))); |
| typedef _Float16 v8h __attribute__((ext_vector_type(8))); |
| typedef _Float16 v16h __attribute__((ext_vector_type(16))); |
| |
| // CHECK-GFX942-LABEL: define dso_local void @_Z32test_smfmac_f32_16x16x32_f16_hipPDv4_fDv4_DF16_Dv8_DF16_S_i( |
| // CHECK-GFX942-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[IDX:%.*]]) #[[ATTR0:[0-9]+]] { |
| // CHECK-GFX942-NEXT: [[ENTRY:.*:]] |
| // CHECK-GFX942-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) |
| // CHECK-GFX942-NEXT: [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) |
| // CHECK-GFX942-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) |
| // CHECK-GFX942-NEXT: [[C_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) |
| // CHECK-GFX942-NEXT: [[IDX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-GFX942-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-GFX942-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-GFX942-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr |
| // CHECK-GFX942-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr |
| // CHECK-GFX942-NEXT: [[IDX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IDX_ADDR]] to ptr |
| // CHECK-GFX942-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-GFX942-NEXT: store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8 |
| // CHECK-GFX942-NEXT: store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16 |
| // CHECK-GFX942-NEXT: store <4 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 16 |
| // CHECK-GFX942-NEXT: store i32 [[IDX]], ptr [[IDX_ADDR_ASCAST]], align 4 |
| // CHECK-GFX942-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8 |
| // CHECK-GFX942-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16 |
| // CHECK-GFX942-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C_ADDR_ASCAST]], align 16 |
| // CHECK-GFX942-NEXT: [[TMP3:%.*]] = load i32, ptr [[IDX_ADDR_ASCAST]], align 4 |
| // CHECK-GFX942-NEXT: [[TMP4:%.*]] = call contract <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> [[TMP0]], <8 x half> [[TMP1]], <4 x float> [[TMP2]], i32 [[TMP3]], i32 0, i32 0) |
| // CHECK-GFX942-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-GFX942-NEXT: store <4 x float> [[TMP4]], ptr [[TMP5]], align 16 |
| // CHECK-GFX942-NEXT: ret void |
| // |
// Calls __builtin_amdgcn_smfmac_f32_16x16x32_f16 and stores its result
// through `out`.
//   out - destination of the 4 x float result
//   a   - 4 x _Float16 operand
//   b   - 8 x _Float16 operand
//   c   - 4 x float operand (same vector type as the result)
//   idx - runtime int operand, forwarded unchanged to the builtin
// The two trailing literal zeros appear as `i32 0, i32 0` in the expected
// call to @llvm.amdgcn.smfmac.f32.16x16x32.f16.
// NOTE(review): presumably these are the cbsz/abid immediates - confirm
// against the builtin definition.
// The expected IR listed above this function is autogenerated from this exact
// statement; rerun utils/update_cc_test_checks.py after changing it.
| __device__ void test_smfmac_f32_16x16x32_f16_hip(v4f* out, v4h a, v8h b, v4f c, int idx) { |
| *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0); |
| } |
| |
| // CHECK-GFX942-LABEL: define dso_local void @_Z32test_smfmac_f32_32x32x16_f16_hipPDv16_fDv4_DF16_Dv8_DF16_S_i( |
| // CHECK-GFX942-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <16 x float> noundef [[C:%.*]], i32 noundef [[IDX:%.*]]) #[[ATTR0]] { |
| // CHECK-GFX942-NEXT: [[ENTRY:.*:]] |
| // CHECK-GFX942-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) |
| // CHECK-GFX942-NEXT: [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) |
| // CHECK-GFX942-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) |
| // CHECK-GFX942-NEXT: [[C_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5) |
| // CHECK-GFX942-NEXT: [[IDX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-GFX942-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr |
| // CHECK-GFX942-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr |
| // CHECK-GFX942-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr |
| // CHECK-GFX942-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr |
| // CHECK-GFX942-NEXT: [[IDX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IDX_ADDR]] to ptr |
| // CHECK-GFX942-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-GFX942-NEXT: store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8 |
| // CHECK-GFX942-NEXT: store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16 |
| // CHECK-GFX942-NEXT: store <16 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 64 |
| // CHECK-GFX942-NEXT: store i32 [[IDX]], ptr [[IDX_ADDR_ASCAST]], align 4 |
| // CHECK-GFX942-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8 |
| // CHECK-GFX942-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16 |
| // CHECK-GFX942-NEXT: [[TMP2:%.*]] = load <16 x float>, ptr [[C_ADDR_ASCAST]], align 64 |
| // CHECK-GFX942-NEXT: [[TMP3:%.*]] = load i32, ptr [[IDX_ADDR_ASCAST]], align 4 |
| // CHECK-GFX942-NEXT: [[TMP4:%.*]] = call contract <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> [[TMP0]], <8 x half> [[TMP1]], <16 x float> [[TMP2]], i32 [[TMP3]], i32 0, i32 0) |
| // CHECK-GFX942-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 |
| // CHECK-GFX942-NEXT: store <16 x float> [[TMP4]], ptr [[TMP5]], align 64 |
| // CHECK-GFX942-NEXT: ret void |
| // |
// Calls __builtin_amdgcn_smfmac_f32_32x32x16_f16 and stores its result
// through `out`.
//   out - destination of the 16 x float result
//   a   - 4 x _Float16 operand
//   b   - 8 x _Float16 operand
//   c   - 16 x float operand (same vector type as the result)
//   idx - runtime int operand, forwarded unchanged to the builtin
// The two trailing literal zeros appear as `i32 0, i32 0` in the expected
// call to @llvm.amdgcn.smfmac.f32.32x32x16.f16.
// NOTE(review): presumably these are the cbsz/abid immediates - confirm
// against the builtin definition.
// The expected IR listed above this function is autogenerated from this exact
// statement; rerun utils/update_cc_test_checks.py after changing it.
| __device__ void test_smfmac_f32_32x32x16_f16_hip(v16f* out, v4h a, v8h b, v16f c, int idx) { |
| *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, 0); |
| } |