| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck -check-prefix=SM20 %s |
| ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck -check-prefix=SM35 %s |
| ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} |
| ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %} |
| |
| target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" |
| target triple = "nvptx64-unknown-unknown" |
| |
| ; Kernel that copies one float from %from (noalias readonly) to %to. |
| ; Expected output: a plain ld.global.b32 on sm_20 and a cached |
| ; ld.global.nc.b32 on sm_35; the store is st.global.b32 on both targets. |
| define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo1( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b32 %r<2>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo1_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo1_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.b32 %r1, [%rd2]; |
| ; SM20-NEXT: st.global.b32 [%rd4], %r1; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo1( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b32 %r<2>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo1_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo1_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.b32 %r1, [%rd2]; |
| ; SM35-NEXT: st.global.b32 [%rd4], %r1; |
| ; SM35-NEXT: ret; |
| %1 = load float, ptr %from |
| store float %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one double from %from (noalias readonly) to %to. |
| ; Expected output: ld.global.b64 on sm_20 and ld.global.nc.b64 on sm_35. |
| define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo2( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b64 %rd<6>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo2_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo2_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.b64 %rd5, [%rd2]; |
| ; SM20-NEXT: st.global.b64 [%rd4], %rd5; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo2( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b64 %rd<6>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo2_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo2_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2]; |
| ; SM35-NEXT: st.global.b64 [%rd4], %rd5; |
| ; SM35-NEXT: ret; |
| %1 = load double, ptr %from |
| store double %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one i16 from %from (noalias readonly) to %to. |
| ; Expected output: ld.global.b16 on sm_20 and ld.global.nc.b16 on sm_35. |
| define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo3( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b16 %rs<2>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo3_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo3_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.b16 %rs1, [%rd2]; |
| ; SM20-NEXT: st.global.b16 [%rd4], %rs1; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo3( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b16 %rs<2>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo3_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo3_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.b16 %rs1, [%rd2]; |
| ; SM35-NEXT: st.global.b16 [%rd4], %rs1; |
| ; SM35-NEXT: ret; |
| %1 = load i16, ptr %from |
| store i16 %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one i32 from %from (noalias readonly) to %to. |
| ; Expected output: ld.global.b32 on sm_20 and ld.global.nc.b32 on sm_35. |
| define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo4( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b32 %r<2>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo4_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo4_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.b32 %r1, [%rd2]; |
| ; SM20-NEXT: st.global.b32 [%rd4], %r1; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo4( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b32 %r<2>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo4_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo4_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.b32 %r1, [%rd2]; |
| ; SM35-NEXT: st.global.b32 [%rd4], %r1; |
| ; SM35-NEXT: ret; |
| %1 = load i32, ptr %from |
| store i32 %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one i64 from %from (noalias readonly) to %to. |
| ; Expected output: ld.global.b64 on sm_20 and ld.global.nc.b64 on sm_35. |
| define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo5( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b64 %rd<6>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo5_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo5_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.b64 %rd5, [%rd2]; |
| ; SM20-NEXT: st.global.b64 [%rd4], %rd5; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo5( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b64 %rd<6>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo5_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo5_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2]; |
| ; SM35-NEXT: st.global.b64 [%rd4], %rd5; |
| ; SM35-NEXT: ret; |
| %1 = load i64, ptr %from |
| store i64 %1, ptr %to |
| ret void |
| } |
| |
| ; i128 is a non-standard integer type on nvptx64. |
| ; Kernel that copies one i128 from %from (noalias readonly) to %to. |
| ; Expected output: the value is moved as a pair of 64-bit registers, using |
| ; ld.global.v2.b64 on sm_20 and ld.global.nc.v2.b64 on sm_35. |
| define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo6( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b64 %rd<7>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo6_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo6_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2]; |
| ; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo6( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b64 %rd<7>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo6_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo6_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2]; |
| ; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; |
| ; SM35-NEXT: ret; |
| %1 = load i128, ptr %from |
| store i128 %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one <2 x i8> from %from (noalias readonly) to %to. |
| ; Expected output: a v2.b8 vector access whose elements live in 16-bit %rs |
| ; registers; ld.global.v2.b8 on sm_20 and ld.global.nc.v2.b8 on sm_35. |
| define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo7( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b16 %rs<3>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo7_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo7_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.v2.b8 {%rs1, %rs2}, [%rd2]; |
| ; SM20-NEXT: st.global.v2.b8 [%rd4], {%rs1, %rs2}; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo7( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b16 %rs<3>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo7_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo7_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2]; |
| ; SM35-NEXT: st.global.v2.b8 [%rd4], {%rs1, %rs2}; |
| ; SM35-NEXT: ret; |
| %1 = load <2 x i8>, ptr %from |
| store <2 x i8> %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one <2 x i16> from %from (noalias readonly) to %to. |
| ; Expected output: the whole vector is moved as a single 32-bit value, using |
| ; ld.global.b32 on sm_20 and ld.global.nc.b32 on sm_35. |
| define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo8( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b32 %r<2>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo8_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo8_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.b32 %r1, [%rd2]; |
| ; SM20-NEXT: st.global.b32 [%rd4], %r1; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo8( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b32 %r<2>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo8_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo8_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.b32 %r1, [%rd2]; |
| ; SM35-NEXT: st.global.b32 [%rd4], %r1; |
| ; SM35-NEXT: ret; |
| %1 = load <2 x i16>, ptr %from |
| store <2 x i16> %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one <2 x i32> from %from (noalias readonly) to %to. |
| ; Expected output: ld.global.v2.b32 on sm_20 and ld.global.nc.v2.b32 on sm_35. |
| define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo9( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b32 %r<3>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo9_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo9_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd2]; |
| ; SM20-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo9( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b32 %r<3>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo9_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo9_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd2]; |
| ; SM35-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; |
| ; SM35-NEXT: ret; |
| %1 = load <2 x i32>, ptr %from |
| store <2 x i32> %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one <2 x i64> from %from (noalias readonly) to %to. |
| ; Expected output: ld.global.v2.b64 on sm_20 and ld.global.nc.v2.b64 on sm_35. |
| define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo10( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b64 %rd<7>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo10_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo10_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2]; |
| ; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo10( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b64 %rd<7>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo10_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo10_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2]; |
| ; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; |
| ; SM35-NEXT: ret; |
| %1 = load <2 x i64>, ptr %from |
| store <2 x i64> %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one <2 x float> from %from (noalias readonly) to %to. |
| ; Expected output: ld.global.v2.b32 on sm_20 and ld.global.nc.v2.b32 on sm_35. |
| define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo11( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b32 %r<3>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo11_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo11_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd2]; |
| ; SM20-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo11( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b32 %r<3>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo11_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo11_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd2]; |
| ; SM35-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; |
| ; SM35-NEXT: ret; |
| %1 = load <2 x float>, ptr %from |
| store <2 x float> %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one <2 x double> from %from (noalias readonly) to %to. |
| ; Expected output: ld.global.v2.b64 on sm_20 and ld.global.nc.v2.b64 on sm_35. |
| define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo12( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b64 %rd<7>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo12_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo12_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2]; |
| ; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo12( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b64 %rd<7>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo12_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo12_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2]; |
| ; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; |
| ; SM35-NEXT: ret; |
| %1 = load <2 x double>, ptr %from |
| store <2 x double> %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one <4 x i8> from %from (noalias readonly) to %to. |
| ; Expected output: the whole vector is moved as a single 32-bit value, using |
| ; ld.global.b32 on sm_20 and ld.global.nc.b32 on sm_35. |
| define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo13( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b32 %r<2>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo13_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo13_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.b32 %r1, [%rd2]; |
| ; SM20-NEXT: st.global.b32 [%rd4], %r1; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo13( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b32 %r<2>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo13_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo13_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.b32 %r1, [%rd2]; |
| ; SM35-NEXT: st.global.b32 [%rd4], %r1; |
| ; SM35-NEXT: ret; |
| %1 = load <4 x i8>, ptr %from |
| store <4 x i8> %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one <4 x i16> from %from (noalias readonly) to %to. |
| ; Expected output: the vector is moved as two 32-bit registers, using |
| ; ld.global.v2.b32 on sm_20 and ld.global.nc.v2.b32 on sm_35. |
| define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo14( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b32 %r<3>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo14_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo14_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd2]; |
| ; SM20-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo14( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b32 %r<3>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo14_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo14_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd2]; |
| ; SM35-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; |
| ; SM35-NEXT: ret; |
| %1 = load <4 x i16>, ptr %from |
| store <4 x i16> %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one <4 x i32> from %from (noalias readonly) to %to. |
| ; Expected output: ld.global.v4.b32 on sm_20 and ld.global.nc.v4.b32 on sm_35. |
| define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo15( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b32 %r<5>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo15_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo15_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; |
| ; SM20-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4}; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo15( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b32 %r<5>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo15_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo15_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; |
| ; SM35-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4}; |
| ; SM35-NEXT: ret; |
| %1 = load <4 x i32>, ptr %from |
| store <4 x i32> %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one <4 x float> from %from (noalias readonly) to %to. |
| ; Expected output: ld.global.v4.b32 on sm_20 and ld.global.nc.v4.b32 on sm_35. |
| define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo16( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b32 %r<5>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo16_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo16_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; |
| ; SM20-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4}; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo16( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b32 %r<5>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo16_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo16_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; |
| ; SM35-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4}; |
| ; SM35-NEXT: ret; |
| %1 = load <4 x float>, ptr %from |
| store <4 x float> %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one <4 x double> from %from (noalias readonly) to %to. |
| ; Expected output: the 32-byte vector is split into two v2.b64 accesses at |
| ; offsets 0 and 16; both loads are ld.global.v2.b64 on sm_20 and |
| ; ld.global.nc.v2.b64 on sm_35. |
| define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo17( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b64 %rd<9>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo17_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo17_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2]; |
| ; SM20-NEXT: ld.global.v2.b64 {%rd7, %rd8}, [%rd2+16]; |
| ; SM20-NEXT: st.global.v2.b64 [%rd4+16], {%rd7, %rd8}; |
| ; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo17( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b64 %rd<9>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo17_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo17_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2]; |
| ; SM35-NEXT: ld.global.nc.v2.b64 {%rd7, %rd8}, [%rd2+16]; |
| ; SM35-NEXT: st.global.v2.b64 [%rd4+16], {%rd7, %rd8}; |
| ; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; |
| ; SM35-NEXT: ret; |
| %1 = load <4 x double>, ptr %from |
| store <4 x double> %1, ptr %to |
| ret void |
| } |
| |
| ; Kernel that copies one pointer value from %from (noalias readonly) to %to. |
| ; Expected output: the pointer is moved as a 64-bit value, using ld.global.b64 |
| ; on sm_20 and ld.global.nc.b64 on sm_35. |
| define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: foo18( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b64 %rd<6>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [foo18_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo18_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM20-NEXT: ld.global.b64 %rd5, [%rd2]; |
| ; SM20-NEXT: st.global.b64 [%rd4], %rd5; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo18( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b64 %rd<6>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [foo18_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo18_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; |
| ; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2]; |
| ; SM35-NEXT: st.global.b64 [%rd4], %rd5; |
| ; SM35-NEXT: ret; |
| %1 = load ptr, ptr %from |
| store ptr %1, ptr %to |
| ret void |
| } |
| |
| ; Test that we can infer a cached load for a pointer induction variable. |
| ; The kernel sums floats read from %from in a loop and stores the total to |
| ; %to. In the expected output the load address is a register advanced by 4 |
| ; each iteration; the in-loop load is ld.global.b32 on sm_20 and |
| ; ld.global.nc.b32 on sm_35. |
| define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) { |
| ; SM20-LABEL: foo19( |
| ; SM20: { |
| ; SM20-NEXT: .reg .pred %p<2>; |
| ; SM20-NEXT: .reg .b32 %r<4>; |
| ; SM20-NEXT: .reg .b64 %rd<5>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: // %entry |
| ; SM20-NEXT: ld.param.b32 %r2, [foo19_param_2]; |
| ; SM20-NEXT: ld.param.b64 %rd2, [foo19_param_0]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd2; |
| ; SM20-NEXT: ld.param.b64 %rd3, [foo19_param_1]; |
| ; SM20-NEXT: cvta.to.global.u64 %rd1, %rd3; |
| ; SM20-NEXT: mov.b32 %r3, 0f00000000; |
| ; SM20-NEXT: $L__BB18_1: // %loop |
| ; SM20-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; SM20-NEXT: ld.global.b32 %r1, [%rd4]; |
| ; SM20-NEXT: add.rn.f32 %r3, %r1, %r3; |
| ; SM20-NEXT: add.s64 %rd4, %rd4, 4; |
| ; SM20-NEXT: add.s32 %r2, %r2, -1; |
| ; SM20-NEXT: setp.ne.b32 %p1, %r2, 0; |
| ; SM20-NEXT: @%p1 bra $L__BB18_1; |
| ; SM20-NEXT: // %bb.2: // %exit |
| ; SM20-NEXT: st.global.b32 [%rd1], %r3; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: foo19( |
| ; SM35: { |
| ; SM35-NEXT: .reg .pred %p<2>; |
| ; SM35-NEXT: .reg .b32 %r<4>; |
| ; SM35-NEXT: .reg .b64 %rd<5>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: // %entry |
| ; SM35-NEXT: ld.param.b32 %r2, [foo19_param_2]; |
| ; SM35-NEXT: ld.param.b64 %rd2, [foo19_param_0]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd2; |
| ; SM35-NEXT: ld.param.b64 %rd3, [foo19_param_1]; |
| ; SM35-NEXT: cvta.to.global.u64 %rd1, %rd3; |
| ; SM35-NEXT: mov.b32 %r3, 0f00000000; |
| ; SM35-NEXT: $L__BB18_1: // %loop |
| ; SM35-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; SM35-NEXT: ld.global.nc.b32 %r1, [%rd4]; |
| ; SM35-NEXT: add.rn.f32 %r3, %r1, %r3; |
| ; SM35-NEXT: add.s64 %rd4, %rd4, 4; |
| ; SM35-NEXT: add.s32 %r2, %r2, -1; |
| ; SM35-NEXT: setp.ne.b32 %p1, %r2, 0; |
| ; SM35-NEXT: @%p1 bra $L__BB18_1; |
| ; SM35-NEXT: // %bb.2: // %exit |
| ; SM35-NEXT: st.global.b32 [%rd1], %r3; |
| ; SM35-NEXT: ret; |
| entry: |
| br label %loop |
| |
| loop: |
| ; %i is the element index (starting at 0); %sum is the running total. |
| %i = phi i32 [ 0, %entry ], [ %nexti, %loop ] |
| %sum = phi float [ 0.0, %entry ], [ %nextsum, %loop ] |
| ; The load address is derived from %from by indexing with %i. |
| %ptr = getelementptr inbounds float, ptr %from, i32 %i |
| %value = load float, ptr %ptr, align 4 |
| %nextsum = fadd float %value, %sum |
| %nexti = add nsw i32 %i, 1 |
| ; Leave the loop once the incremented index equals %n. |
| %exitcond = icmp eq i32 %nexti, %n |
| br i1 %exitcond, label %exit, label %loop |
| |
| exit: |
| store float %nextsum, ptr %to |
| ret void |
| } |
| |
| ; This test captures the case of a non-kernel function. In a |
| ; non-kernel function, without interprocedural analysis, we do not |
| ; know that the parameter is global. We also do not know that the |
| ; pointed-to memory is never written to (for the duration of the |
| ; kernel). For both reasons, we cannot use a cached load here. |
| ; Expected output: generic ld.b32 / st.b32 (no .global, no .nc) on both |
| ; sm_20 and sm_35. |
| define void @notkernel(ptr noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: notkernel( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b32 %r<2>; |
| ; SM20-NEXT: .reg .b64 %rd<3>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [notkernel_param_0]; |
| ; SM20-NEXT: ld.b32 %r1, [%rd1]; |
| ; SM20-NEXT: ld.param.b64 %rd2, [notkernel_param_1]; |
| ; SM20-NEXT: st.b32 [%rd2], %r1; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: notkernel( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b32 %r<2>; |
| ; SM35-NEXT: .reg .b64 %rd<3>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [notkernel_param_0]; |
| ; SM35-NEXT: ld.b32 %r1, [%rd1]; |
| ; SM35-NEXT: ld.param.b64 %rd2, [notkernel_param_1]; |
| ; SM35-NEXT: st.b32 [%rd2], %r1; |
| ; SM35-NEXT: ret; |
| %1 = load float, ptr %from |
| store float %1, ptr %to |
| ret void |
| } |
| |
| ; As @notkernel, but with the parameter explicitly marked as global. We still |
| ; do not know that the parameter is never written to (for the duration of the |
| ; kernel). This case does not currently come up normally since we do not infer |
| ; that pointers are global interprocedurally as of 2015-08-05. |
| ; Expected output: the load is ld.global.b32 without .nc on both sm_20 and |
| ; sm_35, and the store through the generic %to pointer is a plain st.b32. |
| define void @notkernel2(ptr addrspace(1) noalias readonly %from, ptr %to) { |
| ; SM20-LABEL: notkernel2( |
| ; SM20: { |
| ; SM20-NEXT: .reg .b32 %r<2>; |
| ; SM20-NEXT: .reg .b64 %rd<3>; |
| ; SM20-EMPTY: |
| ; SM20-NEXT: // %bb.0: |
| ; SM20-NEXT: ld.param.b64 %rd1, [notkernel2_param_0]; |
| ; SM20-NEXT: ld.global.b32 %r1, [%rd1]; |
| ; SM20-NEXT: ld.param.b64 %rd2, [notkernel2_param_1]; |
| ; SM20-NEXT: st.b32 [%rd2], %r1; |
| ; SM20-NEXT: ret; |
| ; |
| ; SM35-LABEL: notkernel2( |
| ; SM35: { |
| ; SM35-NEXT: .reg .b32 %r<2>; |
| ; SM35-NEXT: .reg .b64 %rd<3>; |
| ; SM35-EMPTY: |
| ; SM35-NEXT: // %bb.0: |
| ; SM35-NEXT: ld.param.b64 %rd1, [notkernel2_param_0]; |
| ; SM35-NEXT: ld.global.b32 %r1, [%rd1]; |
| ; SM35-NEXT: ld.param.b64 %rd2, [notkernel2_param_1]; |
| ; SM35-NEXT: st.b32 [%rd2], %r1; |
| ; SM35-NEXT: ret; |
| %1 = load float, ptr addrspace(1) %from |
| store float %1, ptr %to |
| ret void |
| } |