blob: e6d3d4c286de97fbf75fffe436609e7e34a88b37 [file] [edit]
; RUN: llvm-as < %s | llvm-dis | FileCheck %s
define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-NEXT: bb:
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true)
; CHECK-NEXT: store <8 x float> %res, ptr addrspace(1) %out, align 32
; CHECK-NEXT: ret void
; CHECK-NEXT: }
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-NEXT: bb:
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(<2 x float> %A, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 true)
; CHECK-NEXT: store <8 x float> %res, ptr addrspace(1) %out
; CHECK-NEXT: ret void
; CHECK-NEXT: }
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 true)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
; CHECK-NEXT: bb:
; CHECK-NEXT: %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 true)
; CHECK-NEXT: store <8 x bfloat> %res, ptr addrspace(1) %out
; CHECK-NEXT: ret void
; CHECK-NEXT: }
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 true)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-NEXT: bb:
; CHECK-NEXT: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 true)
; CHECK-NEXT: store <8 x float> %res, ptr addrspace(1) %out
; CHECK-NEXT: ret void
; CHECK-NEXT: }
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 true)
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; CHECK-NEXT: bb:
; CHECK-NEXT: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 true)
; CHECK-NEXT: store <8 x half> %res, ptr addrspace(1) %out
; CHECK-NEXT: ret void
; CHECK-NEXT: }
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 true)
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-LABEL: define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
; CHECK-NEXT: bb:
; CHECK-NEXT: %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16.v8f32(<16 x bfloat> %A, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true)
; CHECK-NEXT: store <8 x bfloat> %res, ptr addrspace(1) %out
; CHECK-NEXT: ret void
; CHECK-NEXT: }
bb:
%res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true)
store <8 x bfloat> %res, ptr addrspace(1) %out
ret void
}