| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX |
| ; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} |
| |
| ; In this test, we check that all the addressing modes are lowered correctly, |
| ; addr can be any of the following: |
| ; - avar: direct address |
| ; - asi: direct address + offset |
| ; - areg_64: 64-bit register |
| ; - ari_64: 64-bit register + offset |
| ; Since this is a Blackwell+ feature, |
| ; and support for 32-bit addressing does not exist after sm_90, |
| ; the "areg" and "ari" 32-bit addressing modes are not tested or supported. |
| |
| ; Each addressing mode is checked with 8 element types, each as a 256-bit vector: i8, i16, half, bfloat, i32, i64, float, double |
| |
| ; Global is the only address space that currently supports 256-bit loads/stores |
| |
| ; External symbols in addrspace(1): @globalin is the load source and @globalout |
| ; the store destination for the avar_* and asi_* tests in this file. |
| @globalin = external addrspace(1) global ptr |
| @globalout = external addrspace(1) global ptr |
| |
| ; avar / <32 x i8>: copies one 256-bit vector from @globalin to @globalout using |
| ; the bare symbol addresses. The checks expect a single v8.b32 global load/store pair. |
| define void @avar_i8() { |
| ; PTX-LABEL: avar_i8( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <32 x i8>, ptr addrspace(1) @globalin |
| store <32 x i8> %load, ptr addrspace(1) @globalout |
| ret void |
| } |
| |
| ; avar / <16 x i16>: copies one 256-bit vector from @globalin to @globalout using |
| ; the bare symbol addresses. The checks expect a single v8.b32 global load/store pair. |
| define void @avar_i16() { |
| ; PTX-LABEL: avar_i16( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <16 x i16>, ptr addrspace(1) @globalin |
| store <16 x i16> %load, ptr addrspace(1) @globalout |
| ret void |
| } |
| |
| ; avar / <16 x half>: copies one 256-bit vector from @globalin to @globalout using |
| ; the bare symbol addresses. The checks expect a single v8.b32 global load/store pair. |
| define void @avar_half() { |
| ; PTX-LABEL: avar_half( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <16 x half>, ptr addrspace(1) @globalin |
| store <16 x half> %load, ptr addrspace(1) @globalout |
| ret void |
| } |
| |
| ; avar / <16 x bfloat>: copies one 256-bit vector from @globalin to @globalout using |
| ; the bare symbol addresses. The checks expect a single v8.b32 global load/store pair. |
| define void @avar_bfloat() { |
| ; PTX-LABEL: avar_bfloat( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <16 x bfloat>, ptr addrspace(1) @globalin |
| store <16 x bfloat> %load, ptr addrspace(1) @globalout |
| ret void |
| } |
| |
| ; avar / <8 x i32>: copies one 256-bit vector from @globalin to @globalout using |
| ; the bare symbol addresses. The checks expect a single v8.b32 global load/store pair. |
| define void @avar_i32() { |
| ; PTX-LABEL: avar_i32( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <8 x i32>, ptr addrspace(1) @globalin |
| store <8 x i32> %load, ptr addrspace(1) @globalout |
| ret void |
| } |
| |
| ; avar / <4 x i64>: copies one 256-bit vector from @globalin to @globalout using |
| ; the bare symbol addresses. The checks expect a single v4.b64 global load/store pair. |
| define void @avar_i64() { |
| ; PTX-LABEL: avar_i64( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b64 %rd<5>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; |
| ; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; |
| ; PTX-NEXT: ret; |
| %load = load <4 x i64>, ptr addrspace(1) @globalin |
| store <4 x i64> %load, ptr addrspace(1) @globalout |
| ret void |
| } |
| |
| ; avar / <8 x float>: copies one 256-bit vector from @globalin to @globalout using |
| ; the bare symbol addresses. The checks expect a single v8.b32 global load/store pair. |
| define void @avar_float() { |
| ; PTX-LABEL: avar_float( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <8 x float>, ptr addrspace(1) @globalin |
| store <8 x float> %load, ptr addrspace(1) @globalout |
| ret void |
| } |
| |
| ; avar / <4 x double>: copies one 256-bit vector from @globalin to @globalout using |
| ; the bare symbol addresses. The checks expect a single v4.b64 global load/store pair. |
| define void @avar_double() { |
| ; PTX-LABEL: avar_double( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b64 %rd<5>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; |
| ; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; |
| ; PTX-NEXT: ret; |
| %load = load <4 x double>, ptr addrspace(1) @globalin |
| store <4 x double> %load, ptr addrspace(1) @globalout |
| ret void |
| } |
| |
| ; asi / <32 x i8>: copies one 256-bit vector from @globalin+32 to @globalout+32. |
| ; The checks expect the constant offset folded into the symbol operand of a |
| ; single v8.b32 global load/store pair. |
| define void @asi_i8() { |
| ; PTX-LABEL: asi_i8( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 |
| %load = load <32 x i8>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 |
| store <32 x i8> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; asi / <16 x i16>: copies one 256-bit vector from @globalin+32 to @globalout+32. |
| ; The checks expect the constant offset folded into the symbol operand of a |
| ; single v8.b32 global load/store pair. |
| define void @asi_i16() { |
| ; PTX-LABEL: asi_i16( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 |
| %load = load <16 x i16>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 |
| store <16 x i16> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; asi / <16 x half>: copies one 256-bit vector from @globalin+32 to @globalout+32. |
| ; The checks expect the constant offset folded into the symbol operand of a |
| ; single v8.b32 global load/store pair. |
| define void @asi_half() { |
| ; PTX-LABEL: asi_half( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 |
| %load = load <16 x half>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 |
| store <16 x half> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; asi / <16 x bfloat>: copies one 256-bit vector from @globalin+32 to @globalout+32. |
| ; The checks expect the constant offset folded into the symbol operand of a |
| ; single v8.b32 global load/store pair. |
| define void @asi_bfloat() { |
| ; PTX-LABEL: asi_bfloat( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 |
| %load = load <16 x bfloat>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 |
| store <16 x bfloat> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; asi / <8 x i32>: copies one 256-bit vector from @globalin+32 to @globalout+32. |
| ; The checks expect the constant offset folded into the symbol operand of a |
| ; single v8.b32 global load/store pair. |
| define void @asi_i32() { |
| ; PTX-LABEL: asi_i32( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 |
| %load = load <8 x i32>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 |
| store <8 x i32> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; asi / <4 x i64>: copies one 256-bit vector from @globalin+32 to @globalout+32. |
| ; The checks expect the constant offset folded into the symbol operand of a |
| ; single v4.b64 global load/store pair. |
| define void @asi_i64() { |
| ; PTX-LABEL: asi_i64( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b64 %rd<5>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; |
| ; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 |
| %load = load <4 x i64>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 |
| store <4 x i64> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; asi / <8 x float>: copies one 256-bit vector from @globalin+32 to @globalout+32. |
| ; The checks expect the constant offset folded into the symbol operand of a |
| ; single v8.b32 global load/store pair. |
| define void @asi_float() { |
| ; PTX-LABEL: asi_float( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; |
| ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 |
| %load = load <8 x float>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 |
| store <8 x float> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; asi / <4 x double>: copies one 256-bit vector from @globalin+32 to @globalout+32. |
| ; The checks expect the constant offset folded into the symbol operand of a |
| ; single v4.b64 global load/store pair. |
| define void @asi_double() { |
| ; PTX-LABEL: asi_double( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b64 %rd<5>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; |
| ; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 |
| %load = load <4 x double>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 |
| store <4 x double> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; areg_64 / <32 x i8>: copies one 256-bit vector from pointer argument %in to |
| ; pointer argument %out. The checks expect each address to be a plain 64-bit |
| ; register operand ([%rd1], [%rd2]) of a v8.b32 global load/store pair. |
| define void @areg_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: areg_64_i8( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i8_param_0]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i8_param_1]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <32 x i8>, ptr addrspace(1) %in |
| store <32 x i8> %load, ptr addrspace(1) %out |
| ret void |
| } |
| ; areg_64 / <16 x i16>: copies one 256-bit vector from pointer argument %in to |
| ; pointer argument %out. The checks expect each address to be a plain 64-bit |
| ; register operand ([%rd1], [%rd2]) of a v8.b32 global load/store pair. |
| define void @areg_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: areg_64_i16( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i16_param_0]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i16_param_1]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <16 x i16>, ptr addrspace(1) %in |
| store <16 x i16> %load, ptr addrspace(1) %out |
| ret void |
| } |
| ; areg_64 / <16 x half>: copies one 256-bit vector from pointer argument %in to |
| ; pointer argument %out. The checks expect each address to be a plain 64-bit |
| ; register operand ([%rd1], [%rd2]) of a v8.b32 global load/store pair. |
| define void @areg_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: areg_64_half( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_half_param_0]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_half_param_1]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <16 x half>, ptr addrspace(1) %in |
| store <16 x half> %load, ptr addrspace(1) %out |
| ret void |
| } |
| ; areg_64 / <16 x bfloat>: copies one 256-bit vector from pointer argument %in to |
| ; pointer argument %out. The checks expect each address to be a plain 64-bit |
| ; register operand ([%rd1], [%rd2]) of a v8.b32 global load/store pair. |
| define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: areg_64_bfloat( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_bfloat_param_0]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_bfloat_param_1]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <16 x bfloat>, ptr addrspace(1) %in |
| store <16 x bfloat> %load, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; areg_64 / <8 x i32>: copies one 256-bit vector from pointer argument %in to |
| ; pointer argument %out. The checks expect each address to be a plain 64-bit |
| ; register operand ([%rd1], [%rd2]) of a v8.b32 global load/store pair. |
| define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: areg_64_i32( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <8 x i32>, ptr addrspace(1) %in |
| store <8 x i32> %load, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; areg_64 / <4 x i64>: copies one 256-bit vector from pointer argument %in to |
| ; pointer argument %out. The checks expect each address to be a plain 64-bit |
| ; register operand ([%rd1], [%rd6]) of a v4.b64 global load/store pair; the data |
| ; and the addresses share the single .b64 register class here. |
| define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: areg_64_i64( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b64 %rd<7>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i64_param_0]; |
| ; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; |
| ; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i64_param_1]; |
| ; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; |
| ; PTX-NEXT: ret; |
| %load = load <4 x i64>, ptr addrspace(1) %in |
| store <4 x i64> %load, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; areg_64 / <8 x float>: copies one 256-bit vector from pointer argument %in to |
| ; pointer argument %out. The checks expect each address to be a plain 64-bit |
| ; register operand ([%rd1], [%rd2]) of a v8.b32 global load/store pair. |
| define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: areg_64_float( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_float_param_1]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| %load = load <8 x float>, ptr addrspace(1) %in |
| store <8 x float> %load, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; areg_64 / <4 x double>: copies one 256-bit vector from pointer argument %in to |
| ; pointer argument %out. The checks expect each address to be a plain 64-bit |
| ; register operand ([%rd1], [%rd6]) of a v4.b64 global load/store pair; the data |
| ; and the addresses share the single .b64 register class here. |
| define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: areg_64_double( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b64 %rd<7>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_double_param_0]; |
| ; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; |
| ; PTX-NEXT: ld.param.b64 %rd6, [areg_64_double_param_1]; |
| ; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; |
| ; PTX-NEXT: ret; |
| %load = load <4 x double>, ptr addrspace(1) %in |
| store <4 x double> %load, ptr addrspace(1) %out |
| ret void |
| } |
| |
| ; ari_64 / <32 x i8>: copies one 256-bit vector from %in+32 to %out+32. The |
| ; checks expect the constant offset folded into the 64-bit register operand |
| ; ([%rd1+32], [%rd2+32]) of a v8.b32 global load/store pair. |
| define void @ari_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: ari_64_i8( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i8_param_0]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i8_param_1]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 |
| %load = load <32 x i8>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 |
| store <32 x i8> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; ari_64 / <16 x i16>: copies one 256-bit vector from %in+32 to %out+32. The |
| ; checks expect the constant offset folded into the 64-bit register operand |
| ; ([%rd1+32], [%rd2+32]) of a v8.b32 global load/store pair. |
| define void @ari_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: ari_64_i16( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i16_param_0]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i16_param_1]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 |
| %load = load <16 x i16>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 |
| store <16 x i16> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; ari_64 / <16 x half>: copies one 256-bit vector from %in+32 to %out+32. The |
| ; checks expect the constant offset folded into the 64-bit register operand |
| ; ([%rd1+32], [%rd2+32]) of a v8.b32 global load/store pair. |
| define void @ari_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: ari_64_half( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_half_param_0]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_half_param_1]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 |
| %load = load <16 x half>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 |
| store <16 x half> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; ari_64 / <16 x bfloat>: copies one 256-bit vector from %in+32 to %out+32. The |
| ; checks expect the constant offset folded into the 64-bit register operand |
| ; ([%rd1+32], [%rd2+32]) of a v8.b32 global load/store pair. |
| define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: ari_64_bfloat( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_bfloat_param_0]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_bfloat_param_1]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 |
| %load = load <16 x bfloat>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 |
| store <16 x bfloat> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; ari_64 / <8 x i32>: copies one 256-bit vector from %in+32 to %out+32. The |
| ; checks expect the constant offset folded into the 64-bit register operand |
| ; ([%rd1+32], [%rd2+32]) of a v8.b32 global load/store pair. |
| define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: ari_64_i32( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 |
| %load = load <8 x i32>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 |
| store <8 x i32> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; ari_64 / <4 x i64>: copies one 256-bit vector from %in+32 to %out+32. The |
| ; checks expect the constant offset folded into the 64-bit register operand |
| ; ([%rd1+32], [%rd2+32]) of a v4.b64 global load/store pair; the data and the |
| ; addresses share the single .b64 register class here. |
| define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: ari_64_i64( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b64 %rd<7>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i64_param_0]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i64_param_1]; |
| ; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; |
| ; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 |
| %load = load <4 x i64>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 |
| store <4 x i64> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; ari_64 / <8 x float>: copies one 256-bit vector from %in+32 to %out+32. The |
| ; checks expect the constant offset folded into the 64-bit register operand |
| ; ([%rd1+32], [%rd2+32]) of a v8.b32 global load/store pair. |
| define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: ari_64_float( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b32 %r<9>; |
| ; PTX-NEXT: .reg .b64 %rd<3>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1]; |
| ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; |
| ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 |
| %load = load <8 x float>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 |
| store <8 x float> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |
| |
| ; ari_64 / <4 x double>: copies one 256-bit vector from %in+32 to %out+32. The |
| ; checks expect the constant offset folded into the 64-bit register operand |
| ; ([%rd1+32], [%rd2+32]) of a v4.b64 global load/store pair; the data and the |
| ; addresses share the single .b64 register class here. |
| define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) { |
| ; PTX-LABEL: ari_64_double( |
| ; PTX: { |
| ; PTX-NEXT: .reg .b64 %rd<7>; |
| ; PTX-EMPTY: |
| ; PTX-NEXT: // %bb.0: |
| ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_double_param_0]; |
| ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_double_param_1]; |
| ; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; |
| ; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; |
| ; PTX-NEXT: ret; |
| ; The GEPs below use an i8 element type, so index 32 is a 32-byte displacement. |
| %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 |
| %load = load <4 x double>, ptr addrspace(1) %in.offset |
| %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 |
| store <4 x double> %load, ptr addrspace(1) %out.offset |
| ret void |
| } |