| // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 |
| // REQUIRES: amdgpu-registered-target |
| // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s |
| // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1011 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s |
| // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1012 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s |
| |
| #define __device__ __attribute__((device)) |
| |
| typedef unsigned int uint; |
| typedef _Float16 __attribute__((ext_vector_type(2))) half2; |
| typedef short __attribute__((ext_vector_type(2))) short2; |
| typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; |
| |
| // CHECK-LABEL: define dso_local void @_Z21test_amdgcn_fdot2_hipPfDv2_DF16_S0_f( |
| // CHECK-SAME: ptr noundef [[FOUT:%.*]], <2 x half> noundef [[V2HA:%.*]], <2 x half> noundef [[V2HB:%.*]], float noundef [[FC:%.*]]) #[[ATTR0:[0-9]+]] { |
| // CHECK-NEXT: [[ENTRY:.*:]] |
| // CHECK-NEXT: [[FOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) |
| // CHECK-NEXT: [[V2HA_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) |
| // CHECK-NEXT: [[V2HB_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) |
| // CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5) |
| // CHECK-NEXT: [[FOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FOUT_ADDR]] to ptr |
| // CHECK-NEXT: [[V2HA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2HA_ADDR]] to ptr |
| // CHECK-NEXT: [[V2HB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2HB_ADDR]] to ptr |
| // CHECK-NEXT: [[FC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FC_ADDR]] to ptr |
| // CHECK-NEXT: store ptr [[FOUT]], ptr [[FOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store <2 x half> [[V2HA]], ptr [[V2HA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store <2 x half> [[V2HB]], ptr [[V2HB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store float [[FC]], ptr [[FC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[V2HA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[V2HB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[FC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = call contract float @llvm.amdgcn.fdot2(<2 x half> [[TMP0]], <2 x half> [[TMP1]], float [[TMP2]], i1 false) |
| // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[FOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 0 |
| // CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load <2 x half>, ptr [[V2HA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr [[V2HB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[FC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = call contract float @llvm.amdgcn.fdot2(<2 x half> [[TMP5]], <2 x half> [[TMP6]], float [[TMP7]], i1 true) |
| // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[FOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 1 |
| // CHECK-NEXT: store float [[TMP8]], ptr [[ARRAYIDX1]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| __device__ void test_amdgcn_fdot2_hip(float* fOut, half2 v2hA, half2 v2hB, float fC) { |
| fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); |
| fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true); |
| } |
| |
| // CHECK-LABEL: define dso_local void @_Z21test_amdgcn_sdot2_hipPiDv2_sS0_i( |
| // CHECK-SAME: ptr noundef [[SIOUT:%.*]], <2 x i16> noundef [[V2SSA:%.*]], <2 x i16> noundef [[V2SSB:%.*]], i32 noundef [[SIC:%.*]]) #[[ATTR0]] { |
| // CHECK-NEXT: [[ENTRY:.*:]] |
| // CHECK-NEXT: [[SIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) |
| // CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) |
| // CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) |
| // CHECK-NEXT: [[SIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIOUT_ADDR]] to ptr |
| // CHECK-NEXT: [[V2SSA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2SSA_ADDR]] to ptr |
| // CHECK-NEXT: [[V2SSB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2SSB_ADDR]] to ptr |
| // CHECK-NEXT: [[SIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIC_ADDR]] to ptr |
| // CHECK-NEXT: store ptr [[SIOUT]], ptr [[SIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store <2 x i16> [[V2SSA]], ptr [[V2SSA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store <2 x i16> [[V2SSB]], ptr [[V2SSB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store i32 [[SIC]], ptr [[SIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[V2SSA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[V2SSB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.sdot2(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]], i32 [[TMP2]], i1 false) |
| // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 |
| // CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr [[V2SSA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load <2 x i16>, ptr [[V2SSB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.sdot2(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]], i32 [[TMP7]], i1 true) |
| // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 |
| // CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| __device__ void test_amdgcn_sdot2_hip(int* siOut, short2 v2ssA, short2 v2ssB, int siC) { |
| siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); |
| siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); |
| } |
| |
| // CHECK-LABEL: define dso_local void @_Z21test_amdgcn_udot2_hipPjDv2_tS0_j( |
| // CHECK-SAME: ptr noundef [[UIOUT:%.*]], <2 x i16> noundef [[V2USA:%.*]], <2 x i16> noundef [[V2USB:%.*]], i32 noundef [[UIC:%.*]]) #[[ATTR0]] { |
| // CHECK-NEXT: [[ENTRY:.*:]] |
| // CHECK-NEXT: [[UIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) |
| // CHECK-NEXT: [[V2USA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) |
| // CHECK-NEXT: [[V2USB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) |
| // CHECK-NEXT: [[UIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[UIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIOUT_ADDR]] to ptr |
| // CHECK-NEXT: [[V2USA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2USA_ADDR]] to ptr |
| // CHECK-NEXT: [[V2USB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V2USB_ADDR]] to ptr |
| // CHECK-NEXT: [[UIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIC_ADDR]] to ptr |
| // CHECK-NEXT: store ptr [[UIOUT]], ptr [[UIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store <2 x i16> [[V2USA]], ptr [[V2USA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store <2 x i16> [[V2USB]], ptr [[V2USB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store i32 [[UIC]], ptr [[UIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[V2USA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[V2USB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.udot2(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]], i32 [[TMP2]], i1 false) |
| // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 |
| // CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr [[V2USA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load <2 x i16>, ptr [[V2USB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.udot2(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]], i32 [[TMP7]], i1 true) |
| // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 |
| // CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| __device__ void test_amdgcn_udot2_hip(uint* uiOut, ushort2 v2usA, ushort2 v2usB, uint uiC) { |
| uiOut[0] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, false); |
| uiOut[1] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, true); |
| } |
| |
| // CHECK-LABEL: define dso_local void @_Z21test_amdgcn_sdot4_hipPiiii( |
| // CHECK-SAME: ptr noundef [[SIOUT:%.*]], i32 noundef [[SIA:%.*]], i32 noundef [[SIB:%.*]], i32 noundef [[SIC:%.*]]) #[[ATTR0]] { |
| // CHECK-NEXT: [[ENTRY:.*:]] |
| // CHECK-NEXT: [[SIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) |
| // CHECK-NEXT: [[SIA_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SIB_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIOUT_ADDR]] to ptr |
| // CHECK-NEXT: [[SIA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIA_ADDR]] to ptr |
| // CHECK-NEXT: [[SIB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIB_ADDR]] to ptr |
| // CHECK-NEXT: [[SIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIC_ADDR]] to ptr |
| // CHECK-NEXT: store ptr [[SIOUT]], ptr [[SIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[SIA]], ptr [[SIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store i32 [[SIB]], ptr [[SIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store i32 [[SIC]], ptr [[SIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.sdot4(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false) |
| // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 |
| // CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.sdot4(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true) |
| // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 |
| // CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| __device__ void test_amdgcn_sdot4_hip(int* siOut, int siA, int siB, int siC) { |
| siOut[0] = __builtin_amdgcn_sdot4(siA, siB, siC, false); |
| siOut[1] = __builtin_amdgcn_sdot4(siA, siB, siC, true); |
| } |
| |
| // CHECK-LABEL: define dso_local void @_Z21test_amdgcn_udot4_hipPjjjj( |
| // CHECK-SAME: ptr noundef [[UIOUT:%.*]], i32 noundef [[UIA:%.*]], i32 noundef [[UIB:%.*]], i32 noundef [[UIC:%.*]]) #[[ATTR0]] { |
| // CHECK-NEXT: [[ENTRY:.*:]] |
| // CHECK-NEXT: [[UIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) |
| // CHECK-NEXT: [[UIA_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[UIB_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[UIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[UIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIOUT_ADDR]] to ptr |
| // CHECK-NEXT: [[UIA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIA_ADDR]] to ptr |
| // CHECK-NEXT: [[UIB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIB_ADDR]] to ptr |
| // CHECK-NEXT: [[UIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIC_ADDR]] to ptr |
| // CHECK-NEXT: store ptr [[UIOUT]], ptr [[UIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[UIA]], ptr [[UIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store i32 [[UIB]], ptr [[UIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store i32 [[UIC]], ptr [[UIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[UIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[UIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.udot4(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false) |
| // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 |
| // CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[UIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[UIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.udot4(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true) |
| // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 |
| // CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| __device__ void test_amdgcn_udot4_hip(uint* uiOut, uint uiA, uint uiB, uint uiC) { |
| uiOut[0] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false); |
| uiOut[1] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true); |
| } |
| |
| // CHECK-LABEL: define dso_local void @_Z21test_amdgcn_sdot8_hipPiiii( |
| // CHECK-SAME: ptr noundef [[SIOUT:%.*]], i32 noundef [[SIA:%.*]], i32 noundef [[SIB:%.*]], i32 noundef [[SIC:%.*]]) #[[ATTR0]] { |
| // CHECK-NEXT: [[ENTRY:.*:]] |
| // CHECK-NEXT: [[SIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) |
| // CHECK-NEXT: [[SIA_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SIB_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[SIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIOUT_ADDR]] to ptr |
| // CHECK-NEXT: [[SIA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIA_ADDR]] to ptr |
| // CHECK-NEXT: [[SIB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIB_ADDR]] to ptr |
| // CHECK-NEXT: [[SIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SIC_ADDR]] to ptr |
| // CHECK-NEXT: store ptr [[SIOUT]], ptr [[SIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[SIA]], ptr [[SIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store i32 [[SIB]], ptr [[SIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store i32 [[SIC]], ptr [[SIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.sdot8(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false) |
| // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 |
| // CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[SIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.sdot8(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true) |
| // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[SIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 |
| // CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| __device__ void test_amdgcn_sdot8_hip(int* siOut, int siA, int siB, int siC) { |
| siOut[0] = __builtin_amdgcn_sdot8(siA, siB, siC, false); |
| siOut[1] = __builtin_amdgcn_sdot8(siA, siB, siC, true); |
| } |
| |
| // CHECK-LABEL: define dso_local void @_Z21test_amdgcn_udot8_hipPjjjj( |
| // CHECK-SAME: ptr noundef [[UIOUT:%.*]], i32 noundef [[UIA:%.*]], i32 noundef [[UIB:%.*]], i32 noundef [[UIC:%.*]]) #[[ATTR0]] { |
| // CHECK-NEXT: [[ENTRY:.*:]] |
| // CHECK-NEXT: [[UIOUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) |
| // CHECK-NEXT: [[UIA_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[UIB_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[UIC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) |
| // CHECK-NEXT: [[UIOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIOUT_ADDR]] to ptr |
| // CHECK-NEXT: [[UIA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIA_ADDR]] to ptr |
| // CHECK-NEXT: [[UIB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIB_ADDR]] to ptr |
| // CHECK-NEXT: [[UIC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UIC_ADDR]] to ptr |
| // CHECK-NEXT: store ptr [[UIOUT]], ptr [[UIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: store i32 [[UIA]], ptr [[UIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store i32 [[UIB]], ptr [[UIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: store i32 [[UIC]], ptr [[UIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[UIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[UIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.udot8(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false) |
| // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 |
| // CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4 |
| // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[UIA_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[UIB_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[UIC_ADDR_ASCAST]], align 4 |
| // CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.udot8(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true) |
| // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[UIOUT_ADDR_ASCAST]], align 8 |
| // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 |
| // CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX1]], align 4 |
| // CHECK-NEXT: ret void |
| // |
| __device__ void test_amdgcn_udot8_hip(uint* uiOut, uint uiA, uint uiB, uint uiC) { |
| uiOut[0] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false); |
| uiOut[1] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true); |
| } |
| |