blob: e071eaaf107fc60c5a8311c2b8119bc9ac740542 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx88 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx88 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_110a -march=nvptx64 -mattr=+ptx90 | FileCheck %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_101a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %}
; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110a | %ptxas-verify -arch=sm_110a %}
define void @tcgen05_mma_mxf8f6f4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) {
; CHECK-LABEL: tcgen05_mma_mxf8f6f4_cta1(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf8f6f4_cta1_param_5];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf8f6f4_cta1_param_0];
; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf8f6f4_cta1_param_2];
; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf8f6f4_cta1_param_3];
; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf8f6f4_cta1_param_4];
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf8f6f4_cta1_param_6];
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf8f6f4_cta1_param_7];
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf8f6f4_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0)
call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1)
call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2)
call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3)
ret void
}
define void @tcgen05_mma_mxf8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) {
; CHECK-LABEL: tcgen05_mma_mxf8f6f4_cta2(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf8f6f4_cta2_param_5];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf8f6f4_cta2_param_0];
; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf8f6f4_cta2_param_2];
; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf8f6f4_cta2_param_3];
; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf8f6f4_cta2_param_4];
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf8f6f4_cta2_param_6];
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf8f6f4_cta2_param_7];
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf8f6f4_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0)
call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1)
call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2)
call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3)
ret void
}
define void @tcgen05_mma_sp_mxf8f6f4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) {
; CHECK-LABEL: tcgen05_mma_sp_mxf8f6f4_cta1(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf8f6f4_cta1_param_5];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf8f6f4_cta1_param_0];
; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf8f6f4_cta1_param_2];
; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf8f6f4_cta1_param_3];
; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf8f6f4_cta1_param_4];
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf8f6f4_cta1_param_6];
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf8f6f4_cta1_param_7];
; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf8f6f4_cta1_param_8];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf8f6f4_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3)
ret void
}
define void @tcgen05_mma_sp_mxf8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) {
; CHECK-LABEL: tcgen05_mma_sp_mxf8f6f4_cta2(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf8f6f4_cta2_param_5];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf8f6f4_cta2_param_0];
; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf8f6f4_cta2_param_2];
; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf8f6f4_cta2_param_3];
; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf8f6f4_cta2_param_4];
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf8f6f4_cta2_param_6];
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf8f6f4_cta2_param_7];
; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf8f6f4_cta2_param_8];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf8f6f4_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3)
ret void
}
define void @tcgen05_mma_mxf4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) {
; CHECK-LABEL: tcgen05_mma_mxf4_cta1(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf4_cta1_param_5];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf4_cta1_param_0];
; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf4_cta1_param_2];
; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf4_cta1_param_3];
; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf4_cta1_param_4];
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf4_cta1_param_6];
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf4_cta1_param_7];
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf4_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0)
call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1)
call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2)
call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3)
ret void
}
define void @tcgen05_mma_mxf4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) {
; CHECK-LABEL: tcgen05_mma_mxf4_cta2(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf4_cta2_param_5];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf4_cta2_param_0];
; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf4_cta2_param_2];
; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf4_cta2_param_3];
; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf4_cta2_param_4];
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf4_cta2_param_6];
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf4_cta2_param_7];
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf4_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0)
call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1)
call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2)
call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3)
call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3)
ret void
}
define void @tcgen05_mma_sp_mxf4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) {
; CHECK-LABEL: tcgen05_mma_sp_mxf4_cta1(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf4_cta1_param_5];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf4_cta1_param_0];
; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf4_cta1_param_2];
; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf4_cta1_param_3];
; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf4_cta1_param_4];
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf4_cta1_param_6];
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf4_cta1_param_7];
; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf4_cta1_param_8];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf4_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3)
ret void
}
define void @tcgen05_mma_sp_mxf4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) {
; CHECK-LABEL: tcgen05_mma_sp_mxf4_cta2(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf4_cta2_param_5];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf4_cta2_param_0];
; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf4_cta2_param_2];
; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf4_cta2_param_3];
; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf4_cta2_param_4];
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf4_cta2_param_6];
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf4_cta2_param_7];
; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf4_cta2_param_8];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf4_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1;
; CHECK-NEXT: ret;
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2)
call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3)
call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3)
ret void
}