; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 -verify-machineinstrs | FileCheck %s -check-prefixes=SM90
; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 -verify-machineinstrs | FileCheck %s -check-prefixes=SM100
; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}

; For 256-bit vectors, check that invariant loads from the
; global addrspace are lowered to ld.global.nc.

; <32 x i8>: SM90 lowers the invariant load to two ld.global.nc.v4.b32;
; SM100 (+ptx88) uses a single 256-bit ld.global.nc.v8.b32.
define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v32i8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<19>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v32i8_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: bfe.u32 %r5, %r4, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs1, %r5;
; SM90-NEXT: bfe.u32 %r6, %r3, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs2, %r6;
; SM90-NEXT: bfe.u32 %r7, %r2, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs3, %r7;
; SM90-NEXT: bfe.u32 %r8, %r1, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs4, %r8;
; SM90-NEXT: ld.global.nc.v4.b32 {%r9, %r10, %r11, %r12}, [%rd1];
; SM90-NEXT: bfe.u32 %r13, %r12, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs5, %r13;
; SM90-NEXT: bfe.u32 %r14, %r11, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs6, %r14;
; SM90-NEXT: bfe.u32 %r15, %r10, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs7, %r15;
; SM90-NEXT: bfe.u32 %r16, %r9, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs8, %r16;
; SM90-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM90-NEXT: cvt.u32.u16 %r17, %rs15;
; SM90-NEXT: and.b32 %r18, %r17, 255;
; SM90-NEXT: st.param.b32 [func_retval0], %r18;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v32i8(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<19>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v32i8_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: bfe.u32 %r9, %r8, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs1, %r9;
; SM100-NEXT: bfe.u32 %r10, %r7, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs2, %r10;
; SM100-NEXT: bfe.u32 %r11, %r6, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs3, %r11;
; SM100-NEXT: bfe.u32 %r12, %r5, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs4, %r12;
; SM100-NEXT: bfe.u32 %r13, %r4, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs5, %r13;
; SM100-NEXT: bfe.u32 %r14, %r3, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs6, %r14;
; SM100-NEXT: bfe.u32 %r15, %r2, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs7, %r15;
; SM100-NEXT: bfe.u32 %r16, %r1, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs8, %r16;
; SM100-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM100-NEXT: cvt.u32.u16 %r17, %rs15;
; SM100-NEXT: and.b32 %r18, %r17, 255;
; SM100-NEXT: st.param.b32 [func_retval0], %r18;
; SM100-NEXT: ret;
  %a = load <32 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <32 x i8> %a, i32 0
  %v2 = extractelement <32 x i8> %a, i32 4
  %v3 = extractelement <32 x i8> %a, i32 8
  %v4 = extractelement <32 x i8> %a, i32 12
  %v5 = extractelement <32 x i8> %a, i32 16
  %v6 = extractelement <32 x i8> %a, i32 20
  %v7 = extractelement <32 x i8> %a, i32 24
  %v8 = extractelement <32 x i8> %a, i32 28
  %sum1 = add i8 %v1, %v2
  %sum2 = add i8 %v3, %v4
  %sum3 = add i8 %v5, %v6
  %sum4 = add i8 %v7, %v8
  %sum5 = add i8 %sum1, %sum2
  %sum6 = add i8 %sum3, %sum4
  %sum7 = add i8 %sum5, %sum6
  ret i8 %sum7
}

; <16 x i16>: SM90 lowers the invariant load to two ld.global.nc.v4.b32;
; SM100 uses a single ld.global.nc.v8.b32.
define i16 @ld_global_v16i16(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v16i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<10>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16i16_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: mov.b32 {%rs1, _}, %r4;
; SM90-NEXT: mov.b32 {%rs2, _}, %r3;
; SM90-NEXT: mov.b32 {%rs3, _}, %r2;
; SM90-NEXT: mov.b32 {%rs4, _}, %r1;
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: mov.b32 {%rs5, _}, %r8;
; SM90-NEXT: mov.b32 {%rs6, _}, %r7;
; SM90-NEXT: mov.b32 {%rs7, _}, %r6;
; SM90-NEXT: mov.b32 {%rs8, _}, %r5;
; SM90-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM90-NEXT: cvt.u32.u16 %r9, %rs15;
; SM90-NEXT: st.param.b32 [func_retval0], %r9;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v16i16(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<10>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16i16_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: mov.b32 {%rs1, _}, %r8;
; SM100-NEXT: mov.b32 {%rs2, _}, %r7;
; SM100-NEXT: mov.b32 {%rs3, _}, %r6;
; SM100-NEXT: mov.b32 {%rs4, _}, %r5;
; SM100-NEXT: mov.b32 {%rs5, _}, %r4;
; SM100-NEXT: mov.b32 {%rs6, _}, %r3;
; SM100-NEXT: mov.b32 {%rs7, _}, %r2;
; SM100-NEXT: mov.b32 {%rs8, _}, %r1;
; SM100-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM100-NEXT: cvt.u32.u16 %r9, %rs15;
; SM100-NEXT: st.param.b32 [func_retval0], %r9;
; SM100-NEXT: ret;
  %a = load <16 x i16>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <16 x i16> %a, i32 0
  %v2 = extractelement <16 x i16> %a, i32 2
  %v3 = extractelement <16 x i16> %a, i32 4
  %v4 = extractelement <16 x i16> %a, i32 6
  %v5 = extractelement <16 x i16> %a, i32 8
  %v6 = extractelement <16 x i16> %a, i32 10
  %v7 = extractelement <16 x i16> %a, i32 12
  %v8 = extractelement <16 x i16> %a, i32 14
  %sum1 = add i16 %v1, %v2
  %sum2 = add i16 %v3, %v4
  %sum3 = add i16 %v5, %v6
  %sum4 = add i16 %v7, %v8
  %sum5 = add i16 %sum1, %sum2
  %sum6 = add i16 %sum3, %sum4
  %sum7 = add i16 %sum5, %sum6
  ret i16 %sum7
}

; <16 x half>: SM90 lowers the invariant load to two ld.global.nc.v4.b32;
; SM100 uses a single ld.global.nc.v8.b32.
define half @ld_global_v16f16(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v16f16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16f16_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: mov.b32 {%rs1, _}, %r4;
; SM90-NEXT: mov.b32 {%rs2, _}, %r3;
; SM90-NEXT: mov.b32 {%rs3, _}, %r2;
; SM90-NEXT: mov.b32 {%rs4, _}, %r1;
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: mov.b32 {%rs5, _}, %r8;
; SM90-NEXT: mov.b32 {%rs6, _}, %r7;
; SM90-NEXT: mov.b32 {%rs7, _}, %r6;
; SM90-NEXT: mov.b32 {%rs8, _}, %r5;
; SM90-NEXT: add.rn.f16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.rn.f16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.rn.f16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.rn.f16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.rn.f16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.rn.f16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.rn.f16 %rs15, %rs13, %rs14;
; SM90-NEXT: st.param.b16 [func_retval0], %rs15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v16f16(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16f16_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: mov.b32 {%rs1, _}, %r8;
; SM100-NEXT: mov.b32 {%rs2, _}, %r7;
; SM100-NEXT: mov.b32 {%rs3, _}, %r6;
; SM100-NEXT: mov.b32 {%rs4, _}, %r5;
; SM100-NEXT: mov.b32 {%rs5, _}, %r4;
; SM100-NEXT: mov.b32 {%rs6, _}, %r3;
; SM100-NEXT: mov.b32 {%rs7, _}, %r2;
; SM100-NEXT: mov.b32 {%rs8, _}, %r1;
; SM100-NEXT: add.rn.f16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.rn.f16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.rn.f16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.rn.f16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.rn.f16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.rn.f16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.rn.f16 %rs15, %rs13, %rs14;
; SM100-NEXT: st.param.b16 [func_retval0], %rs15;
; SM100-NEXT: ret;
  %a = load <16 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <16 x half> %a, i32 0
  %v2 = extractelement <16 x half> %a, i32 2
  %v3 = extractelement <16 x half> %a, i32 4
  %v4 = extractelement <16 x half> %a, i32 6
  %v5 = extractelement <16 x half> %a, i32 8
  %v6 = extractelement <16 x half> %a, i32 10
  %v7 = extractelement <16 x half> %a, i32 12
  %v8 = extractelement <16 x half> %a, i32 14
  %sum1 = fadd half %v1, %v2
  %sum2 = fadd half %v3, %v4
  %sum3 = fadd half %v5, %v6
  %sum4 = fadd half %v7, %v8
  %sum5 = fadd half %sum1, %sum2
  %sum6 = fadd half %sum3, %sum4
  %sum7 = fadd half %sum5, %sum6
  ret half %sum7
}

; <16 x bfloat>: SM90 lowers the invariant load to two ld.global.nc.v4.b32;
; SM100 uses a single ld.global.nc.v8.b32.
define bfloat @ld_global_v16bf16(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v16bf16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16bf16_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: mov.b32 {%rs1, _}, %r4;
; SM90-NEXT: mov.b32 {%rs2, _}, %r3;
; SM90-NEXT: mov.b32 {%rs3, _}, %r2;
; SM90-NEXT: mov.b32 {%rs4, _}, %r1;
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: mov.b32 {%rs5, _}, %r8;
; SM90-NEXT: mov.b32 {%rs6, _}, %r7;
; SM90-NEXT: mov.b32 {%rs7, _}, %r6;
; SM90-NEXT: mov.b32 {%rs8, _}, %r5;
; SM90-NEXT: add.rn.bf16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.rn.bf16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.rn.bf16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.rn.bf16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.rn.bf16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.rn.bf16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.rn.bf16 %rs15, %rs13, %rs14;
; SM90-NEXT: st.param.b16 [func_retval0], %rs15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v16bf16(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16bf16_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: mov.b32 {%rs1, _}, %r8;
; SM100-NEXT: mov.b32 {%rs2, _}, %r7;
; SM100-NEXT: mov.b32 {%rs3, _}, %r6;
; SM100-NEXT: mov.b32 {%rs4, _}, %r5;
; SM100-NEXT: mov.b32 {%rs5, _}, %r4;
; SM100-NEXT: mov.b32 {%rs6, _}, %r3;
; SM100-NEXT: mov.b32 {%rs7, _}, %r2;
; SM100-NEXT: mov.b32 {%rs8, _}, %r1;
; SM100-NEXT: add.rn.bf16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.rn.bf16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.rn.bf16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.rn.bf16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.rn.bf16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.rn.bf16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.rn.bf16 %rs15, %rs13, %rs14;
; SM100-NEXT: st.param.b16 [func_retval0], %rs15;
; SM100-NEXT: ret;
  %a = load <16 x bfloat>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <16 x bfloat> %a, i32 0
  %v2 = extractelement <16 x bfloat> %a, i32 2
  %v3 = extractelement <16 x bfloat> %a, i32 4
  %v4 = extractelement <16 x bfloat> %a, i32 6
  %v5 = extractelement <16 x bfloat> %a, i32 8
  %v6 = extractelement <16 x bfloat> %a, i32 10
  %v7 = extractelement <16 x bfloat> %a, i32 12
  %v8 = extractelement <16 x bfloat> %a, i32 14
  %sum1 = fadd bfloat %v1, %v2
  %sum2 = fadd bfloat %v3, %v4
  %sum3 = fadd bfloat %v5, %v6
  %sum4 = fadd bfloat %v7, %v8
  %sum5 = fadd bfloat %sum1, %sum2
  %sum6 = fadd bfloat %sum3, %sum4
  %sum7 = fadd bfloat %sum5, %sum6
  ret bfloat %sum7
}

; <8 x i32>: SM90 lowers the invariant load to two ld.global.nc.v4.b32;
; SM100 uses a single ld.global.nc.v8.b32.
define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v8i32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<16>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: add.s32 %r9, %r5, %r6;
; SM90-NEXT: add.s32 %r10, %r7, %r8;
; SM90-NEXT: add.s32 %r11, %r1, %r2;
; SM90-NEXT: add.s32 %r12, %r3, %r4;
; SM90-NEXT: add.s32 %r13, %r9, %r10;
; SM90-NEXT: add.s32 %r14, %r11, %r12;
; SM90-NEXT: add.s32 %r15, %r13, %r14;
; SM90-NEXT: st.param.b32 [func_retval0], %r15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v8i32(
; SM100: {
; SM100-NEXT: .reg .b32 %r<16>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: add.s32 %r9, %r1, %r2;
; SM100-NEXT: add.s32 %r10, %r3, %r4;
; SM100-NEXT: add.s32 %r11, %r5, %r6;
; SM100-NEXT: add.s32 %r12, %r7, %r8;
; SM100-NEXT: add.s32 %r13, %r9, %r10;
; SM100-NEXT: add.s32 %r14, %r11, %r12;
; SM100-NEXT: add.s32 %r15, %r13, %r14;
; SM100-NEXT: st.param.b32 [func_retval0], %r15;
; SM100-NEXT: ret;
  %a = load <8 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <8 x i32> %a, i32 0
  %v2 = extractelement <8 x i32> %a, i32 1
  %v3 = extractelement <8 x i32> %a, i32 2
  %v4 = extractelement <8 x i32> %a, i32 3
  %v5 = extractelement <8 x i32> %a, i32 4
  %v6 = extractelement <8 x i32> %a, i32 5
  %v7 = extractelement <8 x i32> %a, i32 6
  %v8 = extractelement <8 x i32> %a, i32 7
  %sum1 = add i32 %v1, %v2
  %sum2 = add i32 %v3, %v4
  %sum3 = add i32 %v5, %v6
  %sum4 = add i32 %v7, %v8
  %sum5 = add i32 %sum1, %sum2
  %sum6 = add i32 %sum3, %sum4
  %sum7 = add i32 %sum5, %sum6

  ret i32 %sum7
}

; <8 x float>: SM90 lowers the invariant load to two ld.global.nc.v4.b32;
; SM100 uses a single ld.global.nc.v8.b32.
define float @ld_global_v8f32(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v8f32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<16>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v8f32_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: add.rn.f32 %r9, %r5, %r6;
; SM90-NEXT: add.rn.f32 %r10, %r7, %r8;
; SM90-NEXT: add.rn.f32 %r11, %r1, %r2;
; SM90-NEXT: add.rn.f32 %r12, %r3, %r4;
; SM90-NEXT: add.rn.f32 %r13, %r9, %r10;
; SM90-NEXT: add.rn.f32 %r14, %r11, %r12;
; SM90-NEXT: add.rn.f32 %r15, %r13, %r14;
; SM90-NEXT: st.param.b32 [func_retval0], %r15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v8f32(
; SM100: {
; SM100-NEXT: .reg .b32 %r<16>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8f32_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: add.rn.f32 %r9, %r1, %r2;
; SM100-NEXT: add.rn.f32 %r10, %r3, %r4;
; SM100-NEXT: add.rn.f32 %r11, %r5, %r6;
; SM100-NEXT: add.rn.f32 %r12, %r7, %r8;
; SM100-NEXT: add.rn.f32 %r13, %r9, %r10;
; SM100-NEXT: add.rn.f32 %r14, %r11, %r12;
; SM100-NEXT: add.rn.f32 %r15, %r13, %r14;
; SM100-NEXT: st.param.b32 [func_retval0], %r15;
; SM100-NEXT: ret;
  %a = load <8 x float>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <8 x float> %a, i32 0
  %v2 = extractelement <8 x float> %a, i32 1
  %v3 = extractelement <8 x float> %a, i32 2
  %v4 = extractelement <8 x float> %a, i32 3
  %v5 = extractelement <8 x float> %a, i32 4
  %v6 = extractelement <8 x float> %a, i32 5
  %v7 = extractelement <8 x float> %a, i32 6
  %v8 = extractelement <8 x float> %a, i32 7
  %sum1 = fadd float %v1, %v2
  %sum2 = fadd float %v3, %v4
  %sum3 = fadd float %v5, %v6
  %sum4 = fadd float %v7, %v8
  %sum5 = fadd float %sum1, %sum2
  %sum6 = fadd float %sum3, %sum4
  %sum7 = fadd float %sum5, %sum6

  ret float %sum7
}

; <4 x i64>: SM90 lowers the invariant load to two ld.global.nc.v2.b64;
; SM100 uses a single 256-bit ld.global.nc.v4.b64.
define i64 @ld_global_v4i64(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v4i64(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<9>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v4i64_param_0];
; SM90-NEXT: ld.global.nc.v2.b64 {%rd2, %rd3}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v2.b64 {%rd4, %rd5}, [%rd1];
; SM90-NEXT: add.s64 %rd6, %rd4, %rd5;
; SM90-NEXT: add.s64 %rd7, %rd2, %rd3;
; SM90-NEXT: add.s64 %rd8, %rd6, %rd7;
; SM90-NEXT: st.param.b64 [func_retval0], %rd8;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v4i64(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<9>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v4i64_param_0];
; SM100-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: add.s64 %rd6, %rd2, %rd3;
; SM100-NEXT: add.s64 %rd7, %rd4, %rd5;
; SM100-NEXT: add.s64 %rd8, %rd6, %rd7;
; SM100-NEXT: st.param.b64 [func_retval0], %rd8;
; SM100-NEXT: ret;
  %a = load <4 x i64>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <4 x i64> %a, i32 0
  %v2 = extractelement <4 x i64> %a, i32 1
  %v3 = extractelement <4 x i64> %a, i32 2
  %v4 = extractelement <4 x i64> %a, i32 3
  %sum1 = add i64 %v1, %v2
  %sum2 = add i64 %v3, %v4
  %sum3 = add i64 %sum1, %sum2
  ret i64 %sum3
}

; <4 x double>: SM90 lowers the invariant load to two ld.global.nc.v2.b64;
; SM100 uses a single 256-bit ld.global.nc.v4.b64.
define double @ld_global_v4f64(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v4f64(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<9>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v4f64_param_0];
; SM90-NEXT: ld.global.nc.v2.b64 {%rd2, %rd3}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v2.b64 {%rd4, %rd5}, [%rd1];
; SM90-NEXT: add.rn.f64 %rd6, %rd4, %rd5;
; SM90-NEXT: add.rn.f64 %rd7, %rd2, %rd3;
; SM90-NEXT: add.rn.f64 %rd8, %rd6, %rd7;
; SM90-NEXT: st.param.b64 [func_retval0], %rd8;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v4f64(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<9>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v4f64_param_0];
; SM100-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: add.rn.f64 %rd6, %rd2, %rd3;
; SM100-NEXT: add.rn.f64 %rd7, %rd4, %rd5;
; SM100-NEXT: add.rn.f64 %rd8, %rd6, %rd7;
; SM100-NEXT: st.param.b64 [func_retval0], %rd8;
; SM100-NEXT: ret;
  %a = load <4 x double>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <4 x double> %a, i32 0
  %v2 = extractelement <4 x double> %a, i32 1
  %v3 = extractelement <4 x double> %a, i32 2
  %v4 = extractelement <4 x double> %a, i32 3
  %sum1 = fadd double %v1, %v2
  %sum2 = fadd double %v3, %v4
  %sum3 = fadd double %sum1, %sum2
  ret double %sum3
}

!0 = !{}