blob: 12c3cf14ee395afd232f3ae8b6fcdde848899a2c [file] [edit]
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1100 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
#define __device__ __attribute__((device))
typedef float v8f __attribute__((ext_vector_type(8)));
typedef _Float16 v16h __attribute__((ext_vector_type(16)));
// CHECK-GFX1100-LABEL: define dso_local void @_Z41test_amdgcn_wmma_f32_16x16x16_f16_w32_hipPDv8_fDv16_DF16_S1_S_(
// CHECK-GFX1100-SAME: ptr noundef [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-GFX1100-NEXT: [[ENTRY:.*:]]
// CHECK-GFX1100-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-GFX1100-NEXT: [[A_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
// CHECK-GFX1100-NEXT: [[B_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
// CHECK-GFX1100-NEXT: [[C_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5)
// CHECK-GFX1100-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-GFX1100-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-GFX1100-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
// CHECK-GFX1100-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
// CHECK-GFX1100-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-GFX1100-NEXT: store <16 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: store <8 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr [[C_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP3:%.*]] = call contract <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[TMP0]], <16 x half> [[TMP1]], <8 x float> [[TMP2]])
// CHECK-GFX1100-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-GFX1100-NEXT: store <8 x float> [[TMP3]], ptr [[TMP4]], align 32
// CHECK-GFX1100-NEXT: ret void
//
__device__ void test_amdgcn_wmma_f32_16x16x16_f16_w32_hip(v8f* out, v16h a, v16h b, v8f c) {
*out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a, b, c);
}
// CHECK-GFX1100-LABEL: define dso_local void @_Z41test_amdgcn_wmma_f16_16x16x16_f16_w32_hipPDv16_DF16_S_S_S_(
// CHECK-GFX1100-SAME: ptr noundef [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-GFX1100-NEXT: [[ENTRY:.*:]]
// CHECK-GFX1100-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-GFX1100-NEXT: [[A_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
// CHECK-GFX1100-NEXT: [[B_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
// CHECK-GFX1100-NEXT: [[C_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
// CHECK-GFX1100-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-GFX1100-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-GFX1100-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
// CHECK-GFX1100-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
// CHECK-GFX1100-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-GFX1100-NEXT: store <16 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: store <16 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP2:%.*]] = load <16 x half>, ptr [[C_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP3:%.*]] = call contract <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[TMP0]], <16 x half> [[TMP1]], <16 x half> [[TMP2]], i1 true)
// CHECK-GFX1100-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-GFX1100-NEXT: store <16 x half> [[TMP3]], ptr [[TMP4]], align 32
// CHECK-GFX1100-NEXT: ret void
//
__device__ void test_amdgcn_wmma_f16_16x16x16_f16_w32_hip(v16h* out, v16h a, v16h b, v16h c) {
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a, b, c, true);
}
// CHECK-GFX1100-LABEL: define dso_local void @_Z46test_amdgcn_wmma_f16_16x16x16_f16_tied_w32_hipPDv16_DF16_S_S_S_(
// CHECK-GFX1100-SAME: ptr noundef [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-GFX1100-NEXT: [[ENTRY:.*:]]
// CHECK-GFX1100-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-GFX1100-NEXT: [[A_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
// CHECK-GFX1100-NEXT: [[B_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
// CHECK-GFX1100-NEXT: [[C_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
// CHECK-GFX1100-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-GFX1100-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-GFX1100-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
// CHECK-GFX1100-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
// CHECK-GFX1100-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-GFX1100-NEXT: store <16 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: store <16 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP2:%.*]] = load <16 x half>, ptr [[C_ADDR_ASCAST]], align 32
// CHECK-GFX1100-NEXT: [[TMP3:%.*]] = call contract <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[TMP0]], <16 x half> [[TMP1]], <16 x half> [[TMP2]], i1 true)
// CHECK-GFX1100-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-GFX1100-NEXT: store <16 x half> [[TMP3]], ptr [[TMP4]], align 32
// CHECK-GFX1100-NEXT: ret void
//
__device__ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32_hip(v16h* out, v16h a, v16h b, v16h c) {
*out = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a, b, c, true);
}