; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 -verify-machineinstrs | FileCheck %s -check-prefixes=SM90
; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 -verify-machineinstrs | FileCheck %s -check-prefixes=SM100
; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}

; For 256-bit vectors, check that invariant loads from the global
; address space are lowered to ld.global.nc (non-coherent loads).
; On sm_90 each 256-bit access is split into two 128-bit loads;
; on sm_100 with PTX 8.8 it is emitted as a single 256-bit load.
define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v32i8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<19>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v32i8_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: bfe.u32 %r5, %r4, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs1, %r5;
; SM90-NEXT: bfe.u32 %r6, %r3, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs2, %r6;
; SM90-NEXT: bfe.u32 %r7, %r2, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs3, %r7;
; SM90-NEXT: bfe.u32 %r8, %r1, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs4, %r8;
; SM90-NEXT: ld.global.nc.v4.b32 {%r9, %r10, %r11, %r12}, [%rd1];
; SM90-NEXT: bfe.u32 %r13, %r12, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs5, %r13;
; SM90-NEXT: bfe.u32 %r14, %r11, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs6, %r14;
; SM90-NEXT: bfe.u32 %r15, %r10, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs7, %r15;
; SM90-NEXT: bfe.u32 %r16, %r9, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs8, %r16;
; SM90-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM90-NEXT: cvt.u32.u16 %r17, %rs15;
; SM90-NEXT: and.b32 %r18, %r17, 255;
; SM90-NEXT: st.param.b32 [func_retval0], %r18;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v32i8(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<19>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v32i8_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: bfe.u32 %r9, %r8, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs1, %r9;
; SM100-NEXT: bfe.u32 %r10, %r7, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs2, %r10;
; SM100-NEXT: bfe.u32 %r11, %r6, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs3, %r11;
; SM100-NEXT: bfe.u32 %r12, %r5, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs4, %r12;
; SM100-NEXT: bfe.u32 %r13, %r4, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs5, %r13;
; SM100-NEXT: bfe.u32 %r14, %r3, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs6, %r14;
; SM100-NEXT: bfe.u32 %r15, %r2, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs7, %r15;
; SM100-NEXT: bfe.u32 %r16, %r1, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs8, %r16;
; SM100-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM100-NEXT: cvt.u32.u16 %r17, %rs15;
; SM100-NEXT: and.b32 %r18, %r17, 255;
; SM100-NEXT: st.param.b32 [func_retval0], %r18;
; SM100-NEXT: ret;
%a = load <32 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <32 x i8> %a, i32 0
%v2 = extractelement <32 x i8> %a, i32 4
%v3 = extractelement <32 x i8> %a, i32 8
%v4 = extractelement <32 x i8> %a, i32 12
%v5 = extractelement <32 x i8> %a, i32 16
%v6 = extractelement <32 x i8> %a, i32 20
%v7 = extractelement <32 x i8> %a, i32 24
%v8 = extractelement <32 x i8> %a, i32 28
%sum1 = add i8 %v1, %v2
%sum2 = add i8 %v3, %v4
%sum3 = add i8 %v5, %v6
%sum4 = add i8 %v7, %v8
%sum5 = add i8 %sum1, %sum2
%sum6 = add i8 %sum3, %sum4
%sum7 = add i8 %sum5, %sum6
ret i8 %sum7
}
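
; <16 x i16> is also 256 bits: two ld.global.nc.v4.b32 on sm_90,
; one ld.global.nc.v8.b32 on sm_100, with the i16 lanes unpacked
; from the 32-bit registers.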
define i16 @ld_global_v16i16(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v16i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<10>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16i16_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: mov.b32 {%rs1, _}, %r4;
; SM90-NEXT: mov.b32 {%rs2, _}, %r3;
; SM90-NEXT: mov.b32 {%rs3, _}, %r2;
; SM90-NEXT: mov.b32 {%rs4, _}, %r1;
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: mov.b32 {%rs5, _}, %r8;
; SM90-NEXT: mov.b32 {%rs6, _}, %r7;
; SM90-NEXT: mov.b32 {%rs7, _}, %r6;
; SM90-NEXT: mov.b32 {%rs8, _}, %r5;
; SM90-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM90-NEXT: cvt.u32.u16 %r9, %rs15;
; SM90-NEXT: st.param.b32 [func_retval0], %r9;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v16i16(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<10>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16i16_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: mov.b32 {%rs1, _}, %r8;
; SM100-NEXT: mov.b32 {%rs2, _}, %r7;
; SM100-NEXT: mov.b32 {%rs3, _}, %r6;
; SM100-NEXT: mov.b32 {%rs4, _}, %r5;
; SM100-NEXT: mov.b32 {%rs5, _}, %r4;
; SM100-NEXT: mov.b32 {%rs6, _}, %r3;
; SM100-NEXT: mov.b32 {%rs7, _}, %r2;
; SM100-NEXT: mov.b32 {%rs8, _}, %r1;
; SM100-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM100-NEXT: cvt.u32.u16 %r9, %rs15;
; SM100-NEXT: st.param.b32 [func_retval0], %r9;
; SM100-NEXT: ret;
%a = load <16 x i16>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <16 x i16> %a, i32 0
%v2 = extractelement <16 x i16> %a, i32 2
%v3 = extractelement <16 x i16> %a, i32 4
%v4 = extractelement <16 x i16> %a, i32 6
%v5 = extractelement <16 x i16> %a, i32 8
%v6 = extractelement <16 x i16> %a, i32 10
%v7 = extractelement <16 x i16> %a, i32 12
%v8 = extractelement <16 x i16> %a, i32 14
%sum1 = add i16 %v1, %v2
%sum2 = add i16 %v3, %v4
%sum3 = add i16 %v5, %v6
%sum4 = add i16 %v7, %v8
%sum5 = add i16 %sum1, %sum2
%sum6 = add i16 %sum3, %sum4
%sum7 = add i16 %sum5, %sum6
ret i16 %sum7
}
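
; <16 x half> gets the same load lowering; only the element
; arithmetic (add.rn.f16) differs.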
define half @ld_global_v16f16(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v16f16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16f16_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: mov.b32 {%rs1, _}, %r4;
; SM90-NEXT: mov.b32 {%rs2, _}, %r3;
; SM90-NEXT: mov.b32 {%rs3, _}, %r2;
; SM90-NEXT: mov.b32 {%rs4, _}, %r1;
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: mov.b32 {%rs5, _}, %r8;
; SM90-NEXT: mov.b32 {%rs6, _}, %r7;
; SM90-NEXT: mov.b32 {%rs7, _}, %r6;
; SM90-NEXT: mov.b32 {%rs8, _}, %r5;
; SM90-NEXT: add.rn.f16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.rn.f16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.rn.f16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.rn.f16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.rn.f16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.rn.f16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.rn.f16 %rs15, %rs13, %rs14;
; SM90-NEXT: st.param.b16 [func_retval0], %rs15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v16f16(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16f16_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: mov.b32 {%rs1, _}, %r8;
; SM100-NEXT: mov.b32 {%rs2, _}, %r7;
; SM100-NEXT: mov.b32 {%rs3, _}, %r6;
; SM100-NEXT: mov.b32 {%rs4, _}, %r5;
; SM100-NEXT: mov.b32 {%rs5, _}, %r4;
; SM100-NEXT: mov.b32 {%rs6, _}, %r3;
; SM100-NEXT: mov.b32 {%rs7, _}, %r2;
; SM100-NEXT: mov.b32 {%rs8, _}, %r1;
; SM100-NEXT: add.rn.f16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.rn.f16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.rn.f16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.rn.f16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.rn.f16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.rn.f16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.rn.f16 %rs15, %rs13, %rs14;
; SM100-NEXT: st.param.b16 [func_retval0], %rs15;
; SM100-NEXT: ret;
%a = load <16 x half>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <16 x half> %a, i32 0
%v2 = extractelement <16 x half> %a, i32 2
%v3 = extractelement <16 x half> %a, i32 4
%v4 = extractelement <16 x half> %a, i32 6
%v5 = extractelement <16 x half> %a, i32 8
%v6 = extractelement <16 x half> %a, i32 10
%v7 = extractelement <16 x half> %a, i32 12
%v8 = extractelement <16 x half> %a, i32 14
%sum1 = fadd half %v1, %v2
%sum2 = fadd half %v3, %v4
%sum3 = fadd half %v5, %v6
%sum4 = fadd half %v7, %v8
%sum5 = fadd half %sum1, %sum2
%sum6 = fadd half %sum3, %sum4
%sum7 = fadd half %sum5, %sum6
ret half %sum7
}
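
; <16 x bfloat> follows the same pattern, using add.rn.bf16 for the sums.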
define bfloat @ld_global_v16bf16(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v16bf16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16bf16_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: mov.b32 {%rs1, _}, %r4;
; SM90-NEXT: mov.b32 {%rs2, _}, %r3;
; SM90-NEXT: mov.b32 {%rs3, _}, %r2;
; SM90-NEXT: mov.b32 {%rs4, _}, %r1;
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: mov.b32 {%rs5, _}, %r8;
; SM90-NEXT: mov.b32 {%rs6, _}, %r7;
; SM90-NEXT: mov.b32 {%rs7, _}, %r6;
; SM90-NEXT: mov.b32 {%rs8, _}, %r5;
; SM90-NEXT: add.rn.bf16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.rn.bf16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.rn.bf16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.rn.bf16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.rn.bf16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.rn.bf16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.rn.bf16 %rs15, %rs13, %rs14;
; SM90-NEXT: st.param.b16 [func_retval0], %rs15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v16bf16(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16bf16_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: mov.b32 {%rs1, _}, %r8;
; SM100-NEXT: mov.b32 {%rs2, _}, %r7;
; SM100-NEXT: mov.b32 {%rs3, _}, %r6;
; SM100-NEXT: mov.b32 {%rs4, _}, %r5;
; SM100-NEXT: mov.b32 {%rs5, _}, %r4;
; SM100-NEXT: mov.b32 {%rs6, _}, %r3;
; SM100-NEXT: mov.b32 {%rs7, _}, %r2;
; SM100-NEXT: mov.b32 {%rs8, _}, %r1;
; SM100-NEXT: add.rn.bf16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.rn.bf16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.rn.bf16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.rn.bf16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.rn.bf16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.rn.bf16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.rn.bf16 %rs15, %rs13, %rs14;
; SM100-NEXT: st.param.b16 [func_retval0], %rs15;
; SM100-NEXT: ret;
%a = load <16 x bfloat>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <16 x bfloat> %a, i32 0
%v2 = extractelement <16 x bfloat> %a, i32 2
%v3 = extractelement <16 x bfloat> %a, i32 4
%v4 = extractelement <16 x bfloat> %a, i32 6
%v5 = extractelement <16 x bfloat> %a, i32 8
%v6 = extractelement <16 x bfloat> %a, i32 10
%v7 = extractelement <16 x bfloat> %a, i32 12
%v8 = extractelement <16 x bfloat> %a, i32 14
%sum1 = fadd bfloat %v1, %v2
%sum2 = fadd bfloat %v3, %v4
%sum3 = fadd bfloat %v5, %v6
%sum4 = fadd bfloat %v7, %v8
%sum5 = fadd bfloat %sum1, %sum2
%sum6 = fadd bfloat %sum3, %sum4
%sum7 = fadd bfloat %sum5, %sum6
ret bfloat %sum7
}
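
; <8 x i32> loads its elements directly into 32-bit registers, so no
; unpacking is needed: two v4.b32 loads on sm_90, one v8.b32 on sm_100.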
define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v8i32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<16>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: add.s32 %r9, %r5, %r6;
; SM90-NEXT: add.s32 %r10, %r7, %r8;
; SM90-NEXT: add.s32 %r11, %r1, %r2;
; SM90-NEXT: add.s32 %r12, %r3, %r4;
; SM90-NEXT: add.s32 %r13, %r9, %r10;
; SM90-NEXT: add.s32 %r14, %r11, %r12;
; SM90-NEXT: add.s32 %r15, %r13, %r14;
; SM90-NEXT: st.param.b32 [func_retval0], %r15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v8i32(
; SM100: {
; SM100-NEXT: .reg .b32 %r<16>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: add.s32 %r9, %r1, %r2;
; SM100-NEXT: add.s32 %r10, %r3, %r4;
; SM100-NEXT: add.s32 %r11, %r5, %r6;
; SM100-NEXT: add.s32 %r12, %r7, %r8;
; SM100-NEXT: add.s32 %r13, %r9, %r10;
; SM100-NEXT: add.s32 %r14, %r11, %r12;
; SM100-NEXT: add.s32 %r15, %r13, %r14;
; SM100-NEXT: st.param.b32 [func_retval0], %r15;
; SM100-NEXT: ret;
%a = load <8 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <8 x i32> %a, i32 0
%v2 = extractelement <8 x i32> %a, i32 1
%v3 = extractelement <8 x i32> %a, i32 2
%v4 = extractelement <8 x i32> %a, i32 3
%v5 = extractelement <8 x i32> %a, i32 4
%v6 = extractelement <8 x i32> %a, i32 5
%v7 = extractelement <8 x i32> %a, i32 6
%v8 = extractelement <8 x i32> %a, i32 7
%sum1 = add i32 %v1, %v2
%sum2 = add i32 %v3, %v4
%sum3 = add i32 %v5, %v6
%sum4 = add i32 %v7, %v8
%sum5 = add i32 %sum1, %sum2
%sum6 = add i32 %sum3, %sum4
%sum7 = add i32 %sum5, %sum6
ret i32 %sum7
}
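
; <8 x float> matches the i32 case, with add.rn.f32 for the reduction.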
define float @ld_global_v8f32(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v8f32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<16>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v8f32_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: add.rn.f32 %r9, %r5, %r6;
; SM90-NEXT: add.rn.f32 %r10, %r7, %r8;
; SM90-NEXT: add.rn.f32 %r11, %r1, %r2;
; SM90-NEXT: add.rn.f32 %r12, %r3, %r4;
; SM90-NEXT: add.rn.f32 %r13, %r9, %r10;
; SM90-NEXT: add.rn.f32 %r14, %r11, %r12;
; SM90-NEXT: add.rn.f32 %r15, %r13, %r14;
; SM90-NEXT: st.param.b32 [func_retval0], %r15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v8f32(
; SM100: {
; SM100-NEXT: .reg .b32 %r<16>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8f32_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: add.rn.f32 %r9, %r1, %r2;
; SM100-NEXT: add.rn.f32 %r10, %r3, %r4;
; SM100-NEXT: add.rn.f32 %r11, %r5, %r6;
; SM100-NEXT: add.rn.f32 %r12, %r7, %r8;
; SM100-NEXT: add.rn.f32 %r13, %r9, %r10;
; SM100-NEXT: add.rn.f32 %r14, %r11, %r12;
; SM100-NEXT: add.rn.f32 %r15, %r13, %r14;
; SM100-NEXT: st.param.b32 [func_retval0], %r15;
; SM100-NEXT: ret;
%a = load <8 x float>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <8 x float> %a, i32 0
%v2 = extractelement <8 x float> %a, i32 1
%v3 = extractelement <8 x float> %a, i32 2
%v4 = extractelement <8 x float> %a, i32 3
%v5 = extractelement <8 x float> %a, i32 4
%v6 = extractelement <8 x float> %a, i32 5
%v7 = extractelement <8 x float> %a, i32 6
%v8 = extractelement <8 x float> %a, i32 7
%sum1 = fadd float %v1, %v2
%sum2 = fadd float %v3, %v4
%sum3 = fadd float %v5, %v6
%sum4 = fadd float %v7, %v8
%sum5 = fadd float %sum1, %sum2
%sum6 = fadd float %sum3, %sum4
%sum7 = fadd float %sum5, %sum6
ret float %sum7
}
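
; <4 x i64> uses 64-bit lanes: two ld.global.nc.v2.b64 on sm_90,
; one ld.global.nc.v4.b64 on sm_100.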
define i64 @ld_global_v4i64(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v4i64(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<9>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v4i64_param_0];
; SM90-NEXT: ld.global.nc.v2.b64 {%rd2, %rd3}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v2.b64 {%rd4, %rd5}, [%rd1];
; SM90-NEXT: add.s64 %rd6, %rd4, %rd5;
; SM90-NEXT: add.s64 %rd7, %rd2, %rd3;
; SM90-NEXT: add.s64 %rd8, %rd6, %rd7;
; SM90-NEXT: st.param.b64 [func_retval0], %rd8;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v4i64(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<9>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v4i64_param_0];
; SM100-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: add.s64 %rd6, %rd2, %rd3;
; SM100-NEXT: add.s64 %rd7, %rd4, %rd5;
; SM100-NEXT: add.s64 %rd8, %rd6, %rd7;
; SM100-NEXT: st.param.b64 [func_retval0], %rd8;
; SM100-NEXT: ret;
%a = load <4 x i64>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <4 x i64> %a, i32 0
%v2 = extractelement <4 x i64> %a, i32 1
%v3 = extractelement <4 x i64> %a, i32 2
%v4 = extractelement <4 x i64> %a, i32 3
%sum1 = add i64 %v1, %v2
%sum2 = add i64 %v3, %v4
%sum3 = add i64 %sum1, %sum2
ret i64 %sum3
}
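
; <4 x double> is the same as the i64 case, with add.rn.f64.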
define double @ld_global_v4f64(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v4f64(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<9>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v4f64_param_0];
; SM90-NEXT: ld.global.nc.v2.b64 {%rd2, %rd3}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v2.b64 {%rd4, %rd5}, [%rd1];
; SM90-NEXT: add.rn.f64 %rd6, %rd4, %rd5;
; SM90-NEXT: add.rn.f64 %rd7, %rd2, %rd3;
; SM90-NEXT: add.rn.f64 %rd8, %rd6, %rd7;
; SM90-NEXT: st.param.b64 [func_retval0], %rd8;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v4f64(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<9>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v4f64_param_0];
; SM100-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: add.rn.f64 %rd6, %rd2, %rd3;
; SM100-NEXT: add.rn.f64 %rd7, %rd4, %rd5;
; SM100-NEXT: add.rn.f64 %rd8, %rd6, %rd7;
; SM100-NEXT: st.param.b64 [func_retval0], %rd8;
; SM100-NEXT: ret;
%a = load <4 x double>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <4 x double> %a, i32 0
%v2 = extractelement <4 x double> %a, i32 1
%v3 = extractelement <4 x double> %a, i32 2
%v4 = extractelement <4 x double> %a, i32 3
%sum1 = fadd double %v1, %v2
%sum2 = fadd double %v3, %v4
%sum3 = fadd double %sum1, %sum2
ret double %sum3
}
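
; Empty node: !invariant.load only requires the metadata's presence.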
!0 = !{}