; blob: b5319935f0f9d94484dc1ed33d9b9c4f10e35a79 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck -check-prefixes=CHECK,SM90 %s
; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM100
; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}
; This test is based on load-store-vectors.ll
; and tests the lowering of 256-bit vector loads/stores.
; Types we are checking: i8, i16, half, bfloat, i32, i64, f32, f64
; Address spaces we are checking: generic, global, shared, local
; - Global is the only address space that currently supports 256-bit/v8 loads/stores,
; the other cases will legalize by splitting to smaller vectors
; 256-bit vector loads/stores are only legal on Blackwell+ (sm_100 and newer), so on sm_90 the vectors will be split
; Types of loads/stores we are checking: normal, volatile
; - No need to check atomic loads/stores (monotonic and unordered) like load-store-vectors.ll checks,
; because those only work with non-vectors
;; generic statespace
; generic
; Copies a <32 x i8> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: the access is split into two v4.b32 loads
; and two v4.b32 stores at offsets 0 and 16.
define void @generic_32xi8(ptr %a, ptr %b) {
; CHECK-LABEL: generic_32xi8(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_32xi8_param_0];
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_32xi8_param_1];
; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <32 x i8>, ptr %a
store <32 x i8> %a.load, ptr %b
ret void
}
; Copies a <16 x i16> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: two v4.b32 loads and two v4.b32 stores
; at offsets 0 and 16.
define void @generic_16xi16(ptr %a, ptr %b) {
; CHECK-LABEL: generic_16xi16(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_16xi16_param_0];
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_16xi16_param_1];
; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <16 x i16>, ptr %a
store <16 x i16> %a.load, ptr %b
ret void
}
; Copies a <16 x half> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: two v4.b32 loads and two v4.b32 stores
; at offsets 0 and 16.
define void @generic_16xhalf(ptr %a, ptr %b) {
; CHECK-LABEL: generic_16xhalf(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_16xhalf_param_0];
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_16xhalf_param_1];
; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <16 x half>, ptr %a
store <16 x half> %a.load, ptr %b
ret void
}
; Copies a <16 x bfloat> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: two v4.b32 loads and two v4.b32 stores
; at offsets 0 and 16.
define void @generic_16xbfloat(ptr %a, ptr %b) {
; CHECK-LABEL: generic_16xbfloat(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_16xbfloat_param_0];
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_16xbfloat_param_1];
; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <16 x bfloat>, ptr %a
store <16 x bfloat> %a.load, ptr %b
ret void
}
; Copies a <8 x i32> (256-bit) vector from generic pointer %a to %b.
; The expected output differs per target: sm_90 uses two v4.b32 loads/stores,
; while sm_100 uses two v2.b64 loads/stores. Both split at offsets 0 and 16.
define void @generic_8xi32(ptr %a, ptr %b) {
; SM90-LABEL: generic_8xi32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0];
; SM90-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [generic_8xi32_param_1];
; SM90-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: generic_8xi32(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0];
; SM100-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [generic_8xi32_param_1];
; SM100-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load <8 x i32>, ptr %a
store <8 x i32> %a.load, ptr %b
ret void
}
; Copies a <4 x i64> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: two v2.b64 loads and two v2.b64 stores
; at offsets 0 and 16.
define void @generic_4xi64(ptr %a, ptr %b) {
; CHECK-LABEL: generic_4xi64(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xi64_param_0];
; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [generic_4xi64_param_1];
; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load <4 x i64>, ptr %a
store <4 x i64> %a.load, ptr %b
ret void
}
; Copies a <8 x float> (256-bit) vector from generic pointer %a to %b.
; The expected output differs per target: sm_90 uses two v4.b32 loads/stores,
; while sm_100 uses two v2.b64 loads/stores. Both split at offsets 0 and 16.
define void @generic_8xfloat(ptr %a, ptr %b) {
; SM90-LABEL: generic_8xfloat(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0];
; SM90-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [generic_8xfloat_param_1];
; SM90-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: generic_8xfloat(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0];
; SM100-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [generic_8xfloat_param_1];
; SM100-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load <8 x float>, ptr %a
store <8 x float> %a.load, ptr %b
ret void
}
; Copies a <4 x double> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: two v2.b64 loads and two v2.b64 stores
; at offsets 0 and 16.
define void @generic_4xdouble(ptr %a, ptr %b) {
; CHECK-LABEL: generic_4xdouble(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xdouble_param_0];
; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [generic_4xdouble_param_1];
; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load <4 x double>, ptr %a
store <4 x double> %a.load, ptr %b
ret void
}
; generic_volatile
; Volatile copy of a <32 x i8> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: two ld.volatile.v4.b32 and two
; st.volatile.v4.b32 at offsets 0 and 16.
define void @generic_volatile_32xi8(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_32xi8(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_32xi8_param_0];
; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_32xi8_param_1];
; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <32 x i8>, ptr %a
store volatile <32 x i8> %a.load, ptr %b
ret void
}
; Volatile copy of a <16 x i16> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: two ld.volatile.v4.b32 and two
; st.volatile.v4.b32 at offsets 0 and 16.
define void @generic_volatile_16xi16(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_16xi16(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_16xi16_param_0];
; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_16xi16_param_1];
; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x i16>, ptr %a
store volatile <16 x i16> %a.load, ptr %b
ret void
}
; Volatile copy of a <16 x half> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: two ld.volatile.v4.b32 and two
; st.volatile.v4.b32 at offsets 0 and 16.
define void @generic_volatile_16xhalf(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_16xhalf(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_16xhalf_param_0];
; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_16xhalf_param_1];
; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x half>, ptr %a
store volatile <16 x half> %a.load, ptr %b
ret void
}
; Volatile copy of a <16 x bfloat> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: two ld.volatile.v4.b32 and two
; st.volatile.v4.b32 at offsets 0 and 16.
define void @generic_volatile_16xbfloat(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_16xbfloat(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_16xbfloat_param_0];
; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_16xbfloat_param_1];
; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x bfloat>, ptr %a
store volatile <16 x bfloat> %a.load, ptr %b
ret void
}
; Volatile copy of a <8 x i32> (256-bit) vector from generic pointer %a to %b.
; The expected output differs per target: sm_90 uses two volatile v4.b32
; loads/stores, while sm_100 uses two volatile v2.b64 loads/stores.
define void @generic_volatile_8xi32(ptr %a, ptr %b) {
; SM90-LABEL: generic_volatile_8xi32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0];
; SM90-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [generic_volatile_8xi32_param_1];
; SM90-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: generic_volatile_8xi32(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0];
; SM100-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [generic_volatile_8xi32_param_1];
; SM100-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load volatile <8 x i32>, ptr %a
store volatile <8 x i32> %a.load, ptr %b
ret void
}
; Volatile copy of a <4 x i64> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: two ld.volatile.v2.b64 and two
; st.volatile.v2.b64 at offsets 0 and 16.
define void @generic_volatile_4xi64(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_4xi64(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_4xi64_param_0];
; CHECK-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [generic_volatile_4xi64_param_1];
; CHECK-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load volatile <4 x i64>, ptr %a
store volatile <4 x i64> %a.load, ptr %b
ret void
}
; Volatile copy of a <8 x float> (256-bit) vector from generic pointer %a to %b.
; The expected output differs per target: sm_90 uses two volatile v4.b32
; loads/stores, while sm_100 uses two volatile v2.b64 loads/stores.
define void @generic_volatile_8xfloat(ptr %a, ptr %b) {
; SM90-LABEL: generic_volatile_8xfloat(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0];
; SM90-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [generic_volatile_8xfloat_param_1];
; SM90-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: generic_volatile_8xfloat(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0];
; SM100-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1];
; SM100-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load volatile <8 x float>, ptr %a
store volatile <8 x float> %a.load, ptr %b
ret void
}
; Volatile copy of a <4 x double> (256-bit) vector from generic pointer %a to %b.
; Expected output on both targets: two ld.volatile.v2.b64 and two
; st.volatile.v2.b64 at offsets 0 and 16.
define void @generic_volatile_4xdouble(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_4xdouble(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_4xdouble_param_0];
; CHECK-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [generic_volatile_4xdouble_param_1];
; CHECK-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load volatile <4 x double>, ptr %a
store volatile <4 x double> %a.load, ptr %b
ret void
}
;; global statespace
; global
; Copies a <32 x i8> (256-bit) vector between global (addrspace 1) pointers.
; sm_90 is expected to split into two ld/st.global.v4.b32 at offsets 0 and 16;
; sm_100 is expected to use a single ld.global.v8.b32 and st.global.v8.b32.
define void @global_32xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_32xi8(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_32xi8_param_0];
; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_32xi8_param_1];
; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_32xi8(
; SM100: {
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_32xi8_param_0];
; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd2, [global_32xi8_param_1];
; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT: ret;
%a.load = load <32 x i8>, ptr addrspace(1) %a
store <32 x i8> %a.load, ptr addrspace(1) %b
ret void
}
; Copies a <16 x i16> (256-bit) vector between global (addrspace 1) pointers.
; sm_90 is expected to split into two ld/st.global.v4.b32 at offsets 0 and 16;
; sm_100 is expected to use a single ld.global.v8.b32 and st.global.v8.b32.
define void @global_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_16xi16(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_16xi16_param_0];
; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_16xi16_param_1];
; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_16xi16(
; SM100: {
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_16xi16_param_0];
; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd2, [global_16xi16_param_1];
; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT: ret;
%a.load = load <16 x i16>, ptr addrspace(1) %a
store <16 x i16> %a.load, ptr addrspace(1) %b
ret void
}
; Copies a <16 x half> (256-bit) vector between global (addrspace 1) pointers.
; sm_90 is expected to split into two ld/st.global.v4.b32 at offsets 0 and 16;
; sm_100 is expected to use a single ld.global.v8.b32 and st.global.v8.b32.
define void @global_16xhalf(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_16xhalf(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_16xhalf_param_0];
; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_16xhalf_param_1];
; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_16xhalf(
; SM100: {
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_16xhalf_param_0];
; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd2, [global_16xhalf_param_1];
; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT: ret;
%a.load = load <16 x half>, ptr addrspace(1) %a
store <16 x half> %a.load, ptr addrspace(1) %b
ret void
}
; Copies a <16 x bfloat> (256-bit) vector between global (addrspace 1) pointers.
; sm_90 is expected to split into two ld/st.global.v4.b32 at offsets 0 and 16;
; sm_100 is expected to use a single ld.global.v8.b32 and st.global.v8.b32.
define void @global_16xbfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_16xbfloat(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_16xbfloat_param_0];
; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_16xbfloat_param_1];
; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_16xbfloat(
; SM100: {
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_16xbfloat_param_0];
; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd2, [global_16xbfloat_param_1];
; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT: ret;
%a.load = load <16 x bfloat>, ptr addrspace(1) %a
store <16 x bfloat> %a.load, ptr addrspace(1) %b
ret void
}
; Copies a <8 x i32> (256-bit) vector between global (addrspace 1) pointers.
; sm_90 is expected to split into two ld/st.global.v4.b32 at offsets 0 and 16;
; sm_100 is expected to use a single ld.global.v4.b64 and st.global.v4.b64.
define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_8xi32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1];
; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_8xi32(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd6, [global_8xi32_param_1];
; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load <8 x i32>, ptr addrspace(1) %a
store <8 x i32> %a.load, ptr addrspace(1) %b
ret void
}
; Copies a <4 x i64> (256-bit) vector between global (addrspace 1) pointers.
; sm_90 is expected to split into two ld/st.global.v2.b64 at offsets 0 and 16;
; sm_100 is expected to use a single ld.global.v4.b64 and st.global.v4.b64.
define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_4xi64(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_4xi64_param_0];
; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd6, [global_4xi64_param_1];
; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_4xi64(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_4xi64_param_0];
; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd6, [global_4xi64_param_1];
; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load <4 x i64>, ptr addrspace(1) %a
store <4 x i64> %a.load, ptr addrspace(1) %b
ret void
}
; Copies a <8 x float> (256-bit) vector between global (addrspace 1) pointers.
; sm_90 is expected to split into two ld/st.global.v4.b32 at offsets 0 and 16;
; sm_100 is expected to use a single ld.global.v4.b64 and st.global.v4.b64.
define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_8xfloat(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0];
; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1];
; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_8xfloat(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0];
; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd6, [global_8xfloat_param_1];
; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load <8 x float>, ptr addrspace(1) %a
store <8 x float> %a.load, ptr addrspace(1) %b
ret void
}
; Copies a <4 x double> (256-bit) vector between global (addrspace 1) pointers.
; sm_90 is expected to split into two ld/st.global.v2.b64 at offsets 0 and 16;
; sm_100 is expected to use a single ld.global.v4.b64 and st.global.v4.b64.
define void @global_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_4xdouble(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_4xdouble_param_0];
; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd6, [global_4xdouble_param_1];
; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_4xdouble(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_4xdouble_param_0];
; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd6, [global_4xdouble_param_1];
; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load <4 x double>, ptr addrspace(1) %a
store <4 x double> %a.load, ptr addrspace(1) %b
ret void
}
; global_volatile
; Volatile copy of a <32 x i8> (256-bit) vector between global (addrspace 1)
; pointers. sm_90 is expected to split into two volatile global v4.b32
; loads/stores; sm_100 is expected to use one volatile global v8.b32 load/store.
define void @global_volatile_32xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_32xi8(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_32xi8_param_0];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_32xi8_param_1];
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_volatile_32xi8(
; SM100: {
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_32xi8_param_0];
; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_32xi8_param_1];
; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT: ret;
%a.load = load volatile <32 x i8>, ptr addrspace(1) %a
store volatile <32 x i8> %a.load, ptr addrspace(1) %b
ret void
}
; Volatile copy of a <16 x i16> (256-bit) vector between global (addrspace 1)
; pointers. sm_90 is expected to split into two volatile global v4.b32
; loads/stores; sm_100 is expected to use one volatile global v8.b32 load/store.
define void @global_volatile_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_16xi16(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_16xi16_param_0];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_16xi16_param_1];
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_volatile_16xi16(
; SM100: {
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_16xi16_param_0];
; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_16xi16_param_1];
; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT: ret;
%a.load = load volatile <16 x i16>, ptr addrspace(1) %a
store volatile <16 x i16> %a.load, ptr addrspace(1) %b
ret void
}
; Volatile copy of a <16 x half> (256-bit) vector between global (addrspace 1)
; pointers. sm_90 is expected to split into two volatile global v4.b32
; loads/stores; sm_100 is expected to use one volatile global v8.b32 load/store.
define void @global_volatile_16xhalf(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_16xhalf(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_16xhalf_param_0];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_16xhalf_param_1];
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_volatile_16xhalf(
; SM100: {
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_16xhalf_param_0];
; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_16xhalf_param_1];
; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT: ret;
%a.load = load volatile <16 x half>, ptr addrspace(1) %a
store volatile <16 x half> %a.load, ptr addrspace(1) %b
ret void
}
; Volatile copy of a <16 x bfloat> (256-bit) vector between global (addrspace 1)
; pointers. sm_90 is expected to split into two volatile global v4.b32
; loads/stores; sm_100 is expected to use one volatile global v8.b32 load/store.
define void @global_volatile_16xbfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_16xbfloat(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_16xbfloat_param_0];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_16xbfloat_param_1];
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_volatile_16xbfloat(
; SM100: {
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_16xbfloat_param_0];
; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_16xbfloat_param_1];
; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT: ret;
%a.load = load volatile <16 x bfloat>, ptr addrspace(1) %a
store volatile <16 x bfloat> %a.load, ptr addrspace(1) %b
ret void
}
; Volatile copy of a <8 x i32> (256-bit) vector between global (addrspace 1)
; pointers. sm_90 is expected to split into two volatile global v4.b32
; loads/stores; sm_100 is expected to use one volatile global v4.b64 load/store.
define void @global_volatile_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_8xi32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_8xi32_param_0];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_8xi32_param_1];
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_volatile_8xi32(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_8xi32_param_0];
; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd6, [global_volatile_8xi32_param_1];
; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load volatile <8 x i32>, ptr addrspace(1) %a
store volatile <8 x i32> %a.load, ptr addrspace(1) %b
ret void
}
; Volatile copy of a <4 x i64> (256-bit) vector between global (addrspace 1)
; pointers. sm_90 is expected to split into two volatile global v2.b64
; loads/stores; sm_100 is expected to use one volatile global v4.b64 load/store.
define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_4xi64(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_4xi64_param_0];
; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd6, [global_volatile_4xi64_param_1];
; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_volatile_4xi64(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_4xi64_param_0];
; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd6, [global_volatile_4xi64_param_1];
; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load volatile <4 x i64>, ptr addrspace(1) %a
store volatile <4 x i64> %a.load, ptr addrspace(1) %b
ret void
}
; Volatile copy of a <8 x float> (256-bit) vector between global (addrspace 1)
; pointers. sm_90 is expected to split into two volatile global v4.b32
; loads/stores; sm_100 is expected to use one volatile global v4.b64 load/store.
define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_8xfloat(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_8xfloat_param_1];
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_volatile_8xfloat(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0];
; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd6, [global_volatile_8xfloat_param_1];
; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load volatile <8 x float>, ptr addrspace(1) %a
store volatile <8 x float> %a.load, ptr addrspace(1) %b
ret void
}
; Volatile copy of a <4 x double> (256-bit) vector between global (addrspace 1)
; pointers. sm_90 is expected to split into two volatile global v2.b64
; loads/stores; sm_100 is expected to use one volatile global v4.b64 load/store.
define void @global_volatile_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_4xdouble(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_4xdouble_param_0];
; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd6, [global_volatile_4xdouble_param_1];
; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT: ret;
;
; SM100-LABEL: global_volatile_4xdouble(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_4xdouble_param_0];
; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd6, [global_volatile_4xdouble_param_1];
; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load volatile <4 x double>, ptr addrspace(1) %a
store volatile <4 x double> %a.load, ptr addrspace(1) %b
ret void
}
;; shared statespace
; shared
; Copy a <32 x i8> (256 bits) between two addrspace(3) pointers.
; Both runs expect the access to be split into two ld/st.shared.v4.b32 halves
; at offsets 0 and 16, with the upper half stored first.
define void @shared_32xi8(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_32xi8(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_32xi8_param_0];
; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_32xi8_param_1];
; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <32 x i8>, ptr addrspace(3) %a
store <32 x i8> %a.load, ptr addrspace(3) %b
ret void
}
; Copy a <16 x i16> (256 bits) between two addrspace(3) pointers.
; Both runs expect the access to be split into two ld/st.shared.v4.b32 halves
; at offsets 0 and 16, with the upper half stored first.
define void @shared_16xi16(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_16xi16(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_16xi16_param_0];
; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_16xi16_param_1];
; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <16 x i16>, ptr addrspace(3) %a
store <16 x i16> %a.load, ptr addrspace(3) %b
ret void
}
; Copy a <16 x half> (256 bits) between two addrspace(3) pointers.
; Both runs expect the access to be split into two ld/st.shared.v4.b32 halves
; at offsets 0 and 16, with the upper half stored first.
define void @shared_16xhalf(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_16xhalf(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_16xhalf_param_0];
; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_16xhalf_param_1];
; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <16 x half>, ptr addrspace(3) %a
store <16 x half> %a.load, ptr addrspace(3) %b
ret void
}
; Copy a <16 x bfloat> (256 bits) between two addrspace(3) pointers.
; Both runs expect the access to be split into two ld/st.shared.v4.b32 halves
; at offsets 0 and 16, with the upper half stored first.
define void @shared_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_16xbfloat(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_16xbfloat_param_0];
; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_16xbfloat_param_1];
; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <16 x bfloat>, ptr addrspace(3) %a
store <16 x bfloat> %a.load, ptr addrspace(3) %b
ret void
}
; Copy a <8 x i32> (256 bits) between two addrspace(3) pointers.
; Both runs expect two 128-bit halves (offsets 0 and 16), but the packing differs:
; the sm_90 run expects ld/st.shared.v4.b32, the sm_100 run expects ld/st.shared.v2.b64.
define void @shared_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; SM90-LABEL: shared_8xi32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0];
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [shared_8xi32_param_1];
; SM90-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: shared_8xi32(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0];
; SM100-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [shared_8xi32_param_1];
; SM100-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load <8 x i32>, ptr addrspace(3) %a
store <8 x i32> %a.load, ptr addrspace(3) %b
ret void
}
; Copy a <4 x i64> (256 bits) between two addrspace(3) pointers.
; Both runs expect the access to be split into two ld/st.shared.v2.b64 halves
; at offsets 0 and 16, with the upper half stored first.
define void @shared_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_4xi64(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_4xi64_param_0];
; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [shared_4xi64_param_1];
; CHECK-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load <4 x i64>, ptr addrspace(3) %a
store <4 x i64> %a.load, ptr addrspace(3) %b
ret void
}
; Copy a <8 x float> (256 bits) between two addrspace(3) pointers.
; Both runs expect two 128-bit halves (offsets 0 and 16), but the packing differs:
; the sm_90 run expects ld/st.shared.v4.b32, the sm_100 run expects ld/st.shared.v2.b64.
define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; SM90-LABEL: shared_8xfloat(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0];
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [shared_8xfloat_param_1];
; SM90-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: shared_8xfloat(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0];
; SM100-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [shared_8xfloat_param_1];
; SM100-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load <8 x float>, ptr addrspace(3) %a
store <8 x float> %a.load, ptr addrspace(3) %b
ret void
}
; Copy a <4 x double> (256 bits) between two addrspace(3) pointers.
; Both runs expect the access to be split into two ld/st.shared.v2.b64 halves
; at offsets 0 and 16, with the upper half stored first.
define void @shared_4xdouble(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_4xdouble(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_4xdouble_param_0];
; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [shared_4xdouble_param_1];
; CHECK-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load <4 x double>, ptr addrspace(3) %a
store <4 x double> %a.load, ptr addrspace(3) %b
ret void
}
; shared_volatile
; Volatile copy of a <32 x i8> (256 bits) between two addrspace(3) pointers.
; Both runs expect two ld/st.volatile.shared.v4.b32 halves at offsets 0 and 16,
; with the upper half stored first.
define void @shared_volatile_32xi8(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_32xi8(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_32xi8_param_0];
; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_32xi8_param_1];
; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <32 x i8>, ptr addrspace(3) %a
store volatile <32 x i8> %a.load, ptr addrspace(3) %b
ret void
}
; Volatile copy of a <16 x i16> (256 bits) between two addrspace(3) pointers.
; Both runs expect two ld/st.volatile.shared.v4.b32 halves at offsets 0 and 16,
; with the upper half stored first.
define void @shared_volatile_16xi16(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_16xi16(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_16xi16_param_0];
; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_16xi16_param_1];
; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x i16>, ptr addrspace(3) %a
store volatile <16 x i16> %a.load, ptr addrspace(3) %b
ret void
}
; Volatile copy of a <16 x half> (256 bits) between two addrspace(3) pointers.
; Both runs expect two ld/st.volatile.shared.v4.b32 halves at offsets 0 and 16,
; with the upper half stored first.
define void @shared_volatile_16xhalf(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_16xhalf(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_16xhalf_param_0];
; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_16xhalf_param_1];
; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x half>, ptr addrspace(3) %a
store volatile <16 x half> %a.load, ptr addrspace(3) %b
ret void
}
; Volatile copy of a <16 x bfloat> (256 bits) between two addrspace(3) pointers.
; Both runs expect two ld/st.volatile.shared.v4.b32 halves at offsets 0 and 16,
; with the upper half stored first.
define void @shared_volatile_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_16xbfloat(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_16xbfloat_param_0];
; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_16xbfloat_param_1];
; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x bfloat>, ptr addrspace(3) %a
store volatile <16 x bfloat> %a.load, ptr addrspace(3) %b
ret void
}
; Volatile copy of a <8 x i32> (256 bits) between two addrspace(3) pointers.
; Both runs expect two 128-bit volatile halves (offsets 0 and 16), packed differently:
; ld/st.volatile.shared.v4.b32 for the sm_90 run, ld/st.volatile.shared.v2.b64 for sm_100.
define void @shared_volatile_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; SM90-LABEL: shared_volatile_8xi32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0];
; SM90-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [shared_volatile_8xi32_param_1];
; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: shared_volatile_8xi32(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0];
; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [shared_volatile_8xi32_param_1];
; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load volatile <8 x i32>, ptr addrspace(3) %a
store volatile <8 x i32> %a.load, ptr addrspace(3) %b
ret void
}
; Volatile copy of a <4 x i64> (256 bits) between two addrspace(3) pointers.
; Both runs expect two ld/st.volatile.shared.v2.b64 halves at offsets 0 and 16,
; with the upper half stored first.
define void @shared_volatile_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_4xi64(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_4xi64_param_0];
; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [shared_volatile_4xi64_param_1];
; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load volatile <4 x i64>, ptr addrspace(3) %a
store volatile <4 x i64> %a.load, ptr addrspace(3) %b
ret void
}
; Volatile copy of a <8 x float> (256 bits) between two addrspace(3) pointers.
; Both runs expect two 128-bit volatile halves (offsets 0 and 16), packed differently:
; ld/st.volatile.shared.v4.b32 for the sm_90 run, ld/st.volatile.shared.v2.b64 for sm_100.
define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; SM90-LABEL: shared_volatile_8xfloat(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0];
; SM90-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [shared_volatile_8xfloat_param_1];
; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: shared_volatile_8xfloat(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0];
; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1];
; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load volatile <8 x float>, ptr addrspace(3) %a
store volatile <8 x float> %a.load, ptr addrspace(3) %b
ret void
}
; Volatile copy of a <4 x double> (256 bits) between two addrspace(3) pointers.
; Both runs expect two ld/st.volatile.shared.v2.b64 halves at offsets 0 and 16,
; with the upper half stored first.
define void @shared_volatile_4xdouble(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_4xdouble(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_4xdouble_param_0];
; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [shared_volatile_4xdouble_param_1];
; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load volatile <4 x double>, ptr addrspace(3) %a
store volatile <4 x double> %a.load, ptr addrspace(3) %b
ret void
}
;; local statespace
; local
; Copy a <32 x i8> (256 bits) between two addrspace(5) pointers.
; Both runs expect the access to be split into two ld/st.local.v4.b32 halves
; at offsets 0 and 16, with the upper half stored first.
define void @local_32xi8(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_32xi8(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_32xi8_param_0];
; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_32xi8_param_1];
; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <32 x i8>, ptr addrspace(5) %a
store <32 x i8> %a.load, ptr addrspace(5) %b
ret void
}
; Copy a <16 x i16> (256 bits) between two addrspace(5) pointers.
; Both runs expect the access to be split into two ld/st.local.v4.b32 halves
; at offsets 0 and 16, with the upper half stored first.
define void @local_16xi16(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_16xi16(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_16xi16_param_0];
; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_16xi16_param_1];
; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <16 x i16>, ptr addrspace(5) %a
store <16 x i16> %a.load, ptr addrspace(5) %b
ret void
}
; Copy a <16 x half> (256 bits) between two addrspace(5) pointers.
; Both runs expect the access to be split into two ld/st.local.v4.b32 halves
; at offsets 0 and 16, with the upper half stored first.
define void @local_16xhalf(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_16xhalf(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_16xhalf_param_0];
; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_16xhalf_param_1];
; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <16 x half>, ptr addrspace(5) %a
store <16 x half> %a.load, ptr addrspace(5) %b
ret void
}
; Copy a <16 x bfloat> (256 bits) between two addrspace(5) pointers.
; Both runs expect the access to be split into two ld/st.local.v4.b32 halves
; at offsets 0 and 16, with the upper half stored first.
define void @local_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_16xbfloat(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_16xbfloat_param_0];
; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_16xbfloat_param_1];
; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load <16 x bfloat>, ptr addrspace(5) %a
store <16 x bfloat> %a.load, ptr addrspace(5) %b
ret void
}
; Copy a <8 x i32> (256 bits) between two addrspace(5) pointers.
; Both runs expect two 128-bit halves (offsets 0 and 16), but the packing differs:
; the sm_90 run expects ld/st.local.v4.b32, the sm_100 run expects ld/st.local.v2.b64.
define void @local_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; SM90-LABEL: local_8xi32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0];
; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [local_8xi32_param_1];
; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: local_8xi32(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0];
; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [local_8xi32_param_1];
; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load <8 x i32>, ptr addrspace(5) %a
store <8 x i32> %a.load, ptr addrspace(5) %b
ret void
}
; Copy a <4 x i64> (256 bits) between two addrspace(5) pointers.
; Both runs expect the access to be split into two ld/st.local.v2.b64 halves
; at offsets 0 and 16, with the upper half stored first.
define void @local_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_4xi64(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_4xi64_param_0];
; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [local_4xi64_param_1];
; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load <4 x i64>, ptr addrspace(5) %a
store <4 x i64> %a.load, ptr addrspace(5) %b
ret void
}
; Copy a <8 x float> (256 bits) between two addrspace(5) pointers.
; Both runs expect two 128-bit halves (offsets 0 and 16), but the packing differs:
; the sm_90 run expects ld/st.local.v4.b32, the sm_100 run expects ld/st.local.v2.b64.
define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; SM90-LABEL: local_8xfloat(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0];
; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [local_8xfloat_param_1];
; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: local_8xfloat(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0];
; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [local_8xfloat_param_1];
; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load <8 x float>, ptr addrspace(5) %a
store <8 x float> %a.load, ptr addrspace(5) %b
ret void
}
; Copy a <4 x double> (256 bits) between two addrspace(5) pointers.
; Both runs expect the access to be split into two ld/st.local.v2.b64 halves
; at offsets 0 and 16, with the upper half stored first.
define void @local_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_4xdouble(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_4xdouble_param_0];
; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [local_4xdouble_param_1];
; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load <4 x double>, ptr addrspace(5) %a
store <4 x double> %a.load, ptr addrspace(5) %b
ret void
}
; local_volatile
; Volatile copy of a <32 x i8> (256 bits) between two addrspace(5) pointers.
; Although the IR accesses are volatile, the expected PTX for both runs uses plain
; ld/st.local.v4.b32 (no .volatile qualifier), split into halves at offsets 0 and 16.
define void @local_volatile_32xi8(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_32xi8(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_32xi8_param_0];
; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_32xi8_param_1];
; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <32 x i8>, ptr addrspace(5) %a
store volatile <32 x i8> %a.load, ptr addrspace(5) %b
ret void
}
; Volatile copy of a <16 x i16> (256 bits) between two addrspace(5) pointers.
; Although the IR accesses are volatile, the expected PTX for both runs uses plain
; ld/st.local.v4.b32 (no .volatile qualifier), split into halves at offsets 0 and 16.
define void @local_volatile_16xi16(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_16xi16(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xi16_param_0];
; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_16xi16_param_1];
; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x i16>, ptr addrspace(5) %a
store volatile <16 x i16> %a.load, ptr addrspace(5) %b
ret void
}
; Volatile copy of a <16 x half> (256 bits) between two addrspace(5) pointers.
; Although the IR accesses are volatile, the expected PTX for both runs uses plain
; ld/st.local.v4.b32 (no .volatile qualifier), split into halves at offsets 0 and 16.
define void @local_volatile_16xhalf(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_16xhalf(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xhalf_param_0];
; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_16xhalf_param_1];
; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x half>, ptr addrspace(5) %a
store volatile <16 x half> %a.load, ptr addrspace(5) %b
ret void
}
; Volatile copy of a <16 x bfloat> (256 bits) between two addrspace(5) pointers.
; Although the IR accesses are volatile, the expected PTX for both runs uses plain
; ld/st.local.v4.b32 (no .volatile qualifier), split into halves at offsets 0 and 16.
define void @local_volatile_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_16xbfloat(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xbfloat_param_0];
; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_16xbfloat_param_1];
; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x bfloat>, ptr addrspace(5) %a
store volatile <16 x bfloat> %a.load, ptr addrspace(5) %b
ret void
}
; Volatile copy of a <8 x i32> (256 bits) between two addrspace(5) pointers.
; Neither run expects a .volatile qualifier in the PTX. Both expect two 128-bit halves
; (offsets 0 and 16): ld/st.local.v4.b32 for the sm_90 run, ld/st.local.v2.b64 for sm_100.
define void @local_volatile_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; SM90-LABEL: local_volatile_8xi32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0];
; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [local_volatile_8xi32_param_1];
; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: local_volatile_8xi32(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0];
; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [local_volatile_8xi32_param_1];
; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load volatile <8 x i32>, ptr addrspace(5) %a
store volatile <8 x i32> %a.load, ptr addrspace(5) %b
ret void
}
; Volatile copy of a <4 x i64> (256 bits) between two addrspace(5) pointers.
; Although the IR accesses are volatile, the expected PTX for both runs uses plain
; ld/st.local.v2.b64 (no .volatile qualifier), split into halves at offsets 0 and 16.
define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_4xi64(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xi64_param_0];
; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_4xi64_param_1];
; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load volatile <4 x i64>, ptr addrspace(5) %a
store volatile <4 x i64> %a.load, ptr addrspace(5) %b
ret void
}
; Volatile copy of a <8 x float> (256 bits) between two addrspace(5) pointers.
; Neither run expects a .volatile qualifier in the PTX. Both expect two 128-bit halves
; (offsets 0 and 16): ld/st.local.v4.b32 for the sm_90 run, ld/st.local.v2.b64 for sm_100.
define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; SM90-LABEL: local_volatile_8xfloat(
; SM90: {
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd2, [local_volatile_8xfloat_param_1];
; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT: ret;
;
; SM100-LABEL: local_volatile_8xfloat(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM100-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1];
; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
; SM100-NEXT: ret;
%a.load = load volatile <8 x float>, ptr addrspace(5) %a
store volatile <8 x float> %a.load, ptr addrspace(5) %b
ret void
}
; Volatile copy of a <4 x double> (256 bits) between two addrspace(5) pointers.
; Although the IR accesses are volatile, the expected PTX for both runs uses plain
; ld/st.local.v2.b64 (no .volatile qualifier), split into halves at offsets 0 and 16.
define void @local_volatile_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_4xdouble(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xdouble_param_0];
; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_4xdouble_param_1];
; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load volatile <4 x double>, ptr addrspace(5) %a
store volatile <4 x double> %a.load, ptr addrspace(5) %b
ret void
}
; Copy a scalar i256 between two addrspace(1) pointers, both accesses marked align 32.
; The sm_90 run expects two ld/st.global.v2.b64 halves (offsets 0 and 16);
; the sm_100 run expects a single ld.global.v4.b64 and a single st.global.v4.b64.
define void @test_i256_global(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: test_i256_global(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [test_i256_global_param_0];
; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd6, [test_i256_global_param_1];
; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT: ret;
;
; SM100-LABEL: test_i256_global(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [test_i256_global_param_0];
; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd6, [test_i256_global_param_1];
; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load i256, ptr addrspace(1) %a, align 32
store i256 %a.load, ptr addrspace(1) %b, align 32
ret void
}
; Copy a scalar i256 between two addrspace(1) pointers with only align 16 on both accesses.
; With this lower alignment both runs (including sm_100) expect the access to be split
; into two ld/st.global.v2.b64 halves at offsets 0 and 16.
define void @test_i256_global_unaligned(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; CHECK-LABEL: test_i256_global_unaligned(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_i256_global_unaligned_param_0];
; CHECK-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [test_i256_global_unaligned_param_1];
; CHECK-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load i256, ptr addrspace(1) %a, align 16
store i256 %a.load, ptr addrspace(1) %b, align 16
ret void
}
; Copy a scalar i256 between two pointers with no address-space qualifier, align 32.
; Both runs expect the access to be split into two unqualified ld/st.v2.b64 halves
; at offsets 0 and 16, with the upper half stored first.
define void @test_i256_generic(ptr %a, ptr %b) {
; CHECK-LABEL: test_i256_generic(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_i256_generic_param_0];
; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [test_i256_generic_param_1];
; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load i256, ptr %a, align 32
store i256 %a.load, ptr %b, align 32
ret void
}
; Volatile copy of a scalar i256 between two addrspace(1) pointers, align 32.
; The sm_90 run expects two ld/st.volatile.global.v2.b64 halves (offsets 0 and 16);
; the sm_100 run expects a single volatile v4.b64 load and a single v4.b64 store.
define void @test_i256_global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: test_i256_global_volatile(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [test_i256_global_volatile_param_0];
; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT: ld.param.b64 %rd6, [test_i256_global_volatile_param_1];
; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT: ret;
;
; SM100-LABEL: test_i256_global_volatile(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [test_i256_global_volatile_param_0];
; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: ld.param.b64 %rd6, [test_i256_global_volatile_param_1];
; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT: ret;
%a.load = load volatile i256, ptr addrspace(1) %a, align 32
store volatile i256 %a.load, ptr addrspace(1) %b, align 32
ret void
}