; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s
; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;"
; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;"
; TODO: fix "atomic store volatile release": generates "st.release.sys;"
; but should generate "fence.acq_rel.sys; st.mmio.relaxed.sys;"
; TODO: fix "atomic load volatile seq_cst": generates "fence.sc.sys; ld.acquire.sys;"
; but should generate "fence.sc.sys; ld.relaxed.mmio.sys; fence.acq_rel.sys;"
; TODO: fix "atomic store volatile seq_cst": generates "fence.sc.sys; st.release.sys;"
; but should generate "fence.sc.sys; st.relaxed.mmio.sys;"
; TODO: add i1, <8 x i8>, and <6 x i8> vector tests.
; TODO: add tests for vectors that exceed 128 bits in length.
; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
; vectors cannot exceed 128 bits in length, i.e., .v4.u64 is not allowed.
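; For example, .v2.u64 and .v4.u32 (each exactly 128 bits) are legal,
; but .v4.u64 (256 bits) is not.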
; TODO: generate PTX that preserves Concurrent Forward Progress
; for atomic operations to the local statespace
; by lowering them to atomic or volatile operations.
; TODO: design exposure for atomic operations on vector types.
; TODO: implement and test thread scope.
; TODO: add weak, atomic, volatile, and atomic volatile tests
; for the .const and .param statespaces.
; TODO: optimize .sys.shared into .cta.shared or .cluster.shared.
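; For example, "ld.relaxed.sys.shared.b32" could be narrowed to
; "ld.relaxed.cta.shared.b32", since .shared memory is never visible
; beyond its own cluster.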
;; generic statespace
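; Summary of the lowering exercised below (syncscope <none> -> .sys,
; "device" -> .gpu, "block" -> .cta):
;   unordered/monotonic          -> ld.relaxed.<scope> / st.relaxed.<scope>
;   unordered/monotonic volatile -> ld.volatile / st.volatile
;   acquire/release              -> ld.acquire.<scope> / st.release.<scope>
;   acquire/release volatile     -> ld.acquire.sys / st.release.sys
;   seq_cst                      -> fence.sc.<scope>; ld.acquire.<scope> / st.release.<scope>
;   seq_cst volatile             -> fence.sc.sys; ld.acquire.sys / st.release.sys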
; CHECK-LABEL: generic_unordered_gpu
define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_unordered_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_unordered_gpu_param_0];
; CHECK-NEXT: ld.relaxed.gpu.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_unordered_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_unordered_gpu_param_2];
; CHECK-NEXT: st.relaxed.gpu.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_unordered_gpu_param_3];
; CHECK-NEXT: ld.relaxed.gpu.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_unordered_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.gpu.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.gpu.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.gpu.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.gpu.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.gpu.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.gpu.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.gpu.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.gpu.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.gpu.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr %a syncscope("device") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr %a syncscope("device") unordered, align 1
%b.load = load atomic i16, ptr %b syncscope("device") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr %b syncscope("device") unordered, align 2
%c.load = load atomic i32, ptr %c syncscope("device") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr %c syncscope("device") unordered, align 4
%d.load = load atomic i64, ptr %d syncscope("device") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr %d syncscope("device") unordered, align 8
%e.load = load atomic float, ptr %e syncscope("device") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr %e syncscope("device") unordered, align 4
%f.load = load atomic double, ptr %e syncscope("device") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr %e syncscope("device") unordered, align 8
ret void
}
; CHECK-LABEL: generic_unordered_volatile_gpu
define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_unordered_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_unordered_volatile_gpu_param_0];
; CHECK-NEXT: ld.volatile.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_unordered_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_unordered_volatile_gpu_param_2];
; CHECK-NEXT: st.volatile.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_unordered_volatile_gpu_param_3];
; CHECK-NEXT: ld.volatile.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_unordered_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.volatile.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.volatile.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.volatile.b32 [%rd3], %r2;
; CHECK-NEXT: ld.volatile.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.volatile.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.volatile.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.volatile.b32 [%rd5], %r4;
; CHECK-NEXT: ld.volatile.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.volatile.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a syncscope("device") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr %a syncscope("device") unordered, align 1
%b.load = load atomic volatile i16, ptr %b syncscope("device") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr %b syncscope("device") unordered, align 2
%c.load = load atomic volatile i32, ptr %c syncscope("device") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr %c syncscope("device") unordered, align 4
%d.load = load atomic volatile i64, ptr %d syncscope("device") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr %d syncscope("device") unordered, align 8
%e.load = load atomic volatile float, ptr %e syncscope("device") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr %e syncscope("device") unordered, align 4
%f.load = load atomic volatile double, ptr %e syncscope("device") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr %e syncscope("device") unordered, align 8
ret void
}
; CHECK-LABEL: generic_unordered_cta
define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_unordered_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_unordered_cta_param_0];
; CHECK-NEXT: ld.relaxed.cta.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_unordered_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_unordered_cta_param_2];
; CHECK-NEXT: st.relaxed.cta.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_unordered_cta_param_3];
; CHECK-NEXT: ld.relaxed.cta.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_unordered_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.cta.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.cta.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.cta.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.cta.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.cta.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.cta.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.cta.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.cta.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.cta.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr %a syncscope("block") unordered, align 1
%b.load = load atomic i16, ptr %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr %b syncscope("block") unordered, align 2
%c.load = load atomic i32, ptr %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr %c syncscope("block") unordered, align 4
%d.load = load atomic i64, ptr %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr %d syncscope("block") unordered, align 8
%e.load = load atomic float, ptr %e syncscope("block") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr %e syncscope("block") unordered, align 4
%f.load = load atomic double, ptr %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr %e syncscope("block") unordered, align 8
ret void
}
; CHECK-LABEL: generic_unordered_volatile_cta
define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_unordered_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_unordered_volatile_cta_param_0];
; CHECK-NEXT: ld.volatile.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_unordered_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_unordered_volatile_cta_param_2];
; CHECK-NEXT: st.volatile.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_unordered_volatile_cta_param_3];
; CHECK-NEXT: ld.volatile.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_unordered_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.volatile.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.volatile.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.volatile.b32 [%rd3], %r2;
; CHECK-NEXT: ld.volatile.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.volatile.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.volatile.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.volatile.b32 [%rd5], %r4;
; CHECK-NEXT: ld.volatile.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.volatile.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr %a syncscope("block") unordered, align 1
%b.load = load atomic volatile i16, ptr %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr %b syncscope("block") unordered, align 2
%c.load = load atomic volatile i32, ptr %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr %c syncscope("block") unordered, align 4
%d.load = load atomic volatile i64, ptr %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr %d syncscope("block") unordered, align 8
%e.load = load atomic volatile float, ptr %e syncscope("block") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr %e syncscope("block") unordered, align 4
%f.load = load atomic volatile double, ptr %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr %e syncscope("block") unordered, align 8
ret void
}
; CHECK-LABEL: generic_monotonic_gpu
define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_monotonic_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_monotonic_gpu_param_0];
; CHECK-NEXT: ld.relaxed.gpu.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_monotonic_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_monotonic_gpu_param_2];
; CHECK-NEXT: st.relaxed.gpu.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_monotonic_gpu_param_3];
; CHECK-NEXT: ld.relaxed.gpu.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_monotonic_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.gpu.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.gpu.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.gpu.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.gpu.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.gpu.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.gpu.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.gpu.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.gpu.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.gpu.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr %a syncscope("device") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr %a syncscope("device") monotonic, align 1
%b.load = load atomic i16, ptr %b syncscope("device") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr %b syncscope("device") monotonic, align 2
%c.load = load atomic i32, ptr %c syncscope("device") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr %c syncscope("device") monotonic, align 4
%d.load = load atomic i64, ptr %d syncscope("device") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr %d syncscope("device") monotonic, align 8
%e.load = load atomic float, ptr %e syncscope("device") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr %e syncscope("device") monotonic, align 4
%f.load = load atomic double, ptr %e syncscope("device") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr %e syncscope("device") monotonic, align 8
ret void
}
; CHECK-LABEL: generic_monotonic_volatile_gpu
define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_monotonic_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_monotonic_volatile_gpu_param_0];
; CHECK-NEXT: ld.volatile.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_monotonic_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_monotonic_volatile_gpu_param_2];
; CHECK-NEXT: st.volatile.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_monotonic_volatile_gpu_param_3];
; CHECK-NEXT: ld.volatile.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_monotonic_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.volatile.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.volatile.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.volatile.b32 [%rd3], %r2;
; CHECK-NEXT: ld.volatile.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.volatile.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.volatile.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.volatile.b32 [%rd5], %r4;
; CHECK-NEXT: ld.volatile.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.volatile.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a syncscope("device") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr %a syncscope("device") monotonic, align 1
%b.load = load atomic volatile i16, ptr %b syncscope("device") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr %b syncscope("device") monotonic, align 2
%c.load = load atomic volatile i32, ptr %c syncscope("device") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr %c syncscope("device") monotonic, align 4
%d.load = load atomic volatile i64, ptr %d syncscope("device") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr %d syncscope("device") monotonic, align 8
%e.load = load atomic volatile float, ptr %e syncscope("device") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr %e syncscope("device") monotonic, align 4
%f.load = load atomic volatile double, ptr %e syncscope("device") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr %e syncscope("device") monotonic, align 8
ret void
}
; CHECK-LABEL: generic_monotonic_cta
define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_monotonic_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_monotonic_cta_param_0];
; CHECK-NEXT: ld.relaxed.cta.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_monotonic_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_monotonic_cta_param_2];
; CHECK-NEXT: st.relaxed.cta.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_monotonic_cta_param_3];
; CHECK-NEXT: ld.relaxed.cta.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_monotonic_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.cta.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.cta.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.cta.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.cta.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.cta.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.cta.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.cta.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.cta.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.cta.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr %a syncscope("block") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr %a syncscope("block") monotonic, align 1
%b.load = load atomic i16, ptr %b syncscope("block") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr %b syncscope("block") monotonic, align 2
%c.load = load atomic i32, ptr %c syncscope("block") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr %c syncscope("block") monotonic, align 4
%d.load = load atomic i64, ptr %d syncscope("block") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr %d syncscope("block") monotonic, align 8
%e.load = load atomic float, ptr %e syncscope("block") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr %e syncscope("block") monotonic, align 4
%f.load = load atomic double, ptr %e syncscope("block") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr %e syncscope("block") monotonic, align 8
ret void
}
; CHECK-LABEL: generic_monotonic_volatile_cta
define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_monotonic_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_monotonic_volatile_cta_param_0];
; CHECK-NEXT: ld.volatile.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_monotonic_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_monotonic_volatile_cta_param_2];
; CHECK-NEXT: st.volatile.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_monotonic_volatile_cta_param_3];
; CHECK-NEXT: ld.volatile.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_monotonic_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.volatile.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.volatile.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.volatile.b32 [%rd3], %r2;
; CHECK-NEXT: ld.volatile.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.volatile.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.volatile.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.volatile.b32 [%rd5], %r4;
; CHECK-NEXT: ld.volatile.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.volatile.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a syncscope("block") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr %a syncscope("block") monotonic, align 1
%b.load = load atomic volatile i16, ptr %b syncscope("block") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr %b syncscope("block") monotonic, align 2
%c.load = load atomic volatile i32, ptr %c syncscope("block") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr %c syncscope("block") monotonic, align 4
%d.load = load atomic volatile i64, ptr %d syncscope("block") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr %d syncscope("block") monotonic, align 8
%e.load = load atomic volatile float, ptr %e syncscope("block") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr %e syncscope("block") monotonic, align 4
%f.load = load atomic volatile double, ptr %e syncscope("block") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr %e syncscope("block") monotonic, align 8
ret void
}
; CHECK-LABEL: generic_acq_rel_sys
define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_acq_rel_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_acq_rel_sys_param_0];
; CHECK-NEXT: ld.acquire.sys.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_acq_rel_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_acq_rel_sys_param_2];
; CHECK-NEXT: st.release.sys.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_acq_rel_sys_param_3];
; CHECK-NEXT: ld.acquire.sys.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_acq_rel_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr %a acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr %a release, align 1
%b.load = load atomic i16, ptr %b acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr %b release, align 2
%c.load = load atomic i32, ptr %c acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr %c release, align 4
%d.load = load atomic i64, ptr %d acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr %d release, align 8
%e.load = load atomic float, ptr %e acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr %e release, align 4
%f.load = load atomic double, ptr %e acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr %e release, align 8
ret void
}
; CHECK-LABEL: generic_acq_rel_volatile_sys
define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_acq_rel_volatile_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_acq_rel_volatile_sys_param_0];
; CHECK-NEXT: ld.acquire.sys.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_acq_rel_volatile_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_acq_rel_volatile_sys_param_2];
; CHECK-NEXT: st.release.sys.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_acq_rel_volatile_sys_param_3];
; CHECK-NEXT: ld.acquire.sys.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_acq_rel_volatile_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr %a release, align 1
%b.load = load atomic volatile i16, ptr %b acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr %b release, align 2
%c.load = load atomic volatile i32, ptr %c acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr %c release, align 4
%d.load = load atomic volatile i64, ptr %d acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr %d release, align 8
%e.load = load atomic volatile float, ptr %e acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr %e release, align 4
%f.load = load atomic volatile double, ptr %e acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr %e release, align 8
ret void
}
; CHECK-LABEL: generic_acq_rel_gpu
define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_acq_rel_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_acq_rel_gpu_param_0];
; CHECK-NEXT: ld.acquire.gpu.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_acq_rel_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_acq_rel_gpu_param_2];
; CHECK-NEXT: st.release.gpu.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_acq_rel_gpu_param_3];
; CHECK-NEXT: ld.acquire.gpu.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_acq_rel_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.gpu.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.gpu.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.gpu.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.gpu.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.gpu.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.gpu.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.gpu.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.gpu.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.gpu.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr %a syncscope("device") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr %a syncscope("device") release, align 1
%b.load = load atomic i16, ptr %b syncscope("device") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr %b syncscope("device") release, align 2
%c.load = load atomic i32, ptr %c syncscope("device") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr %c syncscope("device") release, align 4
%d.load = load atomic i64, ptr %d syncscope("device") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr %d syncscope("device") release, align 8
%e.load = load atomic float, ptr %e syncscope("device") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr %e syncscope("device") release, align 4
%f.load = load atomic double, ptr %e syncscope("device") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr %e syncscope("device") release, align 8
ret void
}
; CHECK-LABEL: generic_acq_rel_volatile_gpu
define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_acq_rel_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_acq_rel_volatile_gpu_param_0];
; CHECK-NEXT: ld.acquire.sys.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_acq_rel_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_acq_rel_volatile_gpu_param_2];
; CHECK-NEXT: st.release.sys.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_acq_rel_volatile_gpu_param_3];
; CHECK-NEXT: ld.acquire.sys.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_acq_rel_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a syncscope("device") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr %a syncscope("device") release, align 1
%b.load = load atomic volatile i16, ptr %b syncscope("device") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr %b syncscope("device") release, align 2
%c.load = load atomic volatile i32, ptr %c syncscope("device") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr %c syncscope("device") release, align 4
%d.load = load atomic volatile i64, ptr %d syncscope("device") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr %d syncscope("device") release, align 8
%e.load = load atomic volatile float, ptr %e syncscope("device") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr %e syncscope("device") release, align 4
%f.load = load atomic volatile double, ptr %e syncscope("device") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr %e syncscope("device") release, align 8
ret void
}
; CHECK-LABEL: generic_acq_rel_cta
define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_acq_rel_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_acq_rel_cta_param_0];
; CHECK-NEXT: ld.acquire.cta.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_acq_rel_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_acq_rel_cta_param_2];
; CHECK-NEXT: st.release.cta.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_acq_rel_cta_param_3];
; CHECK-NEXT: ld.acquire.cta.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_acq_rel_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.cta.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.cta.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.cta.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.cta.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.cta.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.cta.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.cta.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.cta.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.cta.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr %a syncscope("block") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr %a syncscope("block") release, align 1
%b.load = load atomic i16, ptr %b syncscope("block") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr %b syncscope("block") release, align 2
%c.load = load atomic i32, ptr %c syncscope("block") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr %c syncscope("block") release, align 4
%d.load = load atomic i64, ptr %d syncscope("block") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr %d syncscope("block") release, align 8
%e.load = load atomic float, ptr %e syncscope("block") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr %e syncscope("block") release, align 4
%f.load = load atomic double, ptr %e syncscope("block") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr %e syncscope("block") release, align 8
ret void
}
; CHECK-LABEL: generic_acq_rel_volatile_cta
define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_acq_rel_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_acq_rel_volatile_cta_param_0];
; CHECK-NEXT: ld.acquire.sys.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_acq_rel_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_acq_rel_volatile_cta_param_2];
; CHECK-NEXT: st.release.sys.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_acq_rel_volatile_cta_param_3];
; CHECK-NEXT: ld.acquire.sys.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_acq_rel_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a syncscope("block") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr %a syncscope("block") release, align 1
%b.load = load atomic volatile i16, ptr %b syncscope("block") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr %b syncscope("block") release, align 2
%c.load = load atomic volatile i32, ptr %c syncscope("block") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr %c syncscope("block") release, align 4
%d.load = load atomic volatile i64, ptr %d syncscope("block") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr %d syncscope("block") release, align 8
%e.load = load atomic volatile float, ptr %e syncscope("block") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr %e syncscope("block") release, align 4
%f.load = load atomic volatile double, ptr %e syncscope("block") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr %e syncscope("block") release, align 8
ret void
}
; CHECK-LABEL: generic_sc_sys
define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_sc_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_sc_sys_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_sc_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_sc_sys_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_sc_sys_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_sc_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr %a seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr %a seq_cst, align 1
%b.load = load atomic i16, ptr %b seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr %b seq_cst, align 2
%c.load = load atomic i32, ptr %c seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr %c seq_cst, align 4
%d.load = load atomic i64, ptr %d seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr %d seq_cst, align 8
%e.load = load atomic float, ptr %e seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr %e seq_cst, align 4
%f.load = load atomic double, ptr %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr %e seq_cst, align 8
ret void
}
; CHECK-LABEL: generic_sc_volatile_sys
define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_sc_volatile_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_sc_volatile_sys_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_sc_volatile_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_sc_volatile_sys_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_sc_volatile_sys_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_sc_volatile_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr %a seq_cst, align 1
%b.load = load atomic volatile i16, ptr %b seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr %b seq_cst, align 2
%c.load = load atomic volatile i32, ptr %c seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr %c seq_cst, align 4
%d.load = load atomic volatile i64, ptr %d seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr %d seq_cst, align 8
%e.load = load atomic volatile float, ptr %e seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr %e seq_cst, align 4
%f.load = load atomic volatile double, ptr %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr %e seq_cst, align 8
ret void
}
; CHECK-LABEL: generic_sc_gpu
define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_sc_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_sc_gpu_param_0];
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_sc_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_sc_gpu_param_2];
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_sc_gpu_param_3];
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_sc_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
%b.load = load atomic i16, ptr %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
%c.load = load atomic i32, ptr %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
%d.load = load atomic i64, ptr %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
%e.load = load atomic float, ptr %e syncscope("device") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr %e syncscope("device") seq_cst, align 4
%f.load = load atomic double, ptr %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr %e syncscope("device") seq_cst, align 8
ret void
}
; CHECK-LABEL: generic_sc_volatile_gpu
define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_sc_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_sc_volatile_gpu_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_sc_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_sc_volatile_gpu_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_sc_volatile_gpu_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_sc_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
%b.load = load atomic volatile i16, ptr %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
%c.load = load atomic volatile i32, ptr %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
%d.load = load atomic volatile i64, ptr %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
%e.load = load atomic volatile float, ptr %e syncscope("device") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr %e syncscope("device") seq_cst, align 4
%f.load = load atomic volatile double, ptr %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr %e syncscope("device") seq_cst, align 8
ret void
}
; CHECK-LABEL: generic_sc_cta
define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_sc_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_sc_cta_param_0];
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_sc_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_sc_cta_param_2];
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_sc_cta_param_3];
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_sc_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr %a syncscope("block") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
%b.load = load atomic i16, ptr %b syncscope("block") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
%c.load = load atomic i32, ptr %c syncscope("block") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
%d.load = load atomic i64, ptr %d syncscope("block") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
%e.load = load atomic float, ptr %e syncscope("block") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr %e syncscope("block") seq_cst, align 4
%f.load = load atomic double, ptr %e syncscope("block") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr %e syncscope("block") seq_cst, align 8
ret void
}
; CHECK-LABEL: generic_sc_volatile_cta
define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK-LABEL: generic_sc_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [generic_sc_volatile_cta_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [generic_sc_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [generic_sc_volatile_cta_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [generic_sc_volatile_cta_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [generic_sc_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr %a syncscope("block") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
%b.load = load atomic volatile i16, ptr %b syncscope("block") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
%c.load = load atomic volatile i32, ptr %c syncscope("block") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
%d.load = load atomic volatile i64, ptr %d syncscope("block") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
%e.load = load atomic volatile float, ptr %e syncscope("block") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr %e syncscope("block") seq_cst, align 4
%f.load = load atomic volatile double, ptr %e syncscope("block") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr %e syncscope("block") seq_cst, align 8
ret void
}
;; global statespace
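; Same mapping as the generic statespace above, with the .global qualifier,
; except that, as the checks below show, volatile unordered accesses lower to
; ld.mmio.relaxed.sys / st.mmio.relaxed.sys rather than ld.volatile / st.volatile.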
; CHECK-LABEL: global_unordered_gpu
define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_unordered_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_unordered_gpu_param_0];
; CHECK-NEXT: ld.relaxed.gpu.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_unordered_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_unordered_gpu_param_2];
; CHECK-NEXT: st.relaxed.gpu.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_unordered_gpu_param_3];
; CHECK-NEXT: ld.relaxed.gpu.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_unordered_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.gpu.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.gpu.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.gpu.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.gpu.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.gpu.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.gpu.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.gpu.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.gpu.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.gpu.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
%b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
%c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
%d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
%e.load = load atomic float, ptr addrspace(1) %e syncscope("device") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
%f.load = load atomic double, ptr addrspace(1) %e syncscope("device") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
ret void
}
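; Adding volatile lowers unordered "device"-scope atomics to mmio accesses at sys scope: ld/st.mmio.relaxed.sys.global.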
; CHECK-LABEL: global_unordered_volatile_gpu
define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_unordered_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_gpu_param_0];
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_unordered_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_unordered_volatile_gpu_param_2];
; CHECK-NEXT: st.mmio.relaxed.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_unordered_volatile_gpu_param_3];
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_unordered_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
%b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
%c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
%d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
%e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
%f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
ret void
}
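; Unordered "block"-scope atomics lower to relaxed cta-scoped accesses: ld/st.relaxed.cta.global.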
; CHECK-LABEL: global_unordered_cta
define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_unordered_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_unordered_cta_param_0];
; CHECK-NEXT: ld.relaxed.cta.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_unordered_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_unordered_cta_param_2];
; CHECK-NEXT: st.relaxed.cta.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_unordered_cta_param_3];
; CHECK-NEXT: ld.relaxed.cta.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_unordered_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.cta.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.cta.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.cta.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.cta.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.cta.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.cta.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.cta.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.cta.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.cta.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
%b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
%c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
%d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
%e.load = load atomic float, ptr addrspace(1) %e syncscope("block") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
%f.load = load atomic double, ptr addrspace(1) %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
ret void
}
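; Volatile also widens the "block" scope to sys: ld/st.mmio.relaxed.sys.global.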
; CHECK-LABEL: global_unordered_volatile_cta
define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_unordered_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_cta_param_0];
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_unordered_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_unordered_volatile_cta_param_2];
; CHECK-NEXT: st.mmio.relaxed.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_unordered_volatile_cta_param_3];
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_unordered_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
%b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
%c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
%d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
%e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
%f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
ret void
}
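; Monotonic "device"-scope atomics lower the same way as unordered: ld/st.relaxed.gpu.global.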
; CHECK-LABEL: global_monotonic_gpu
define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_monotonic_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_monotonic_gpu_param_0];
; CHECK-NEXT: ld.relaxed.gpu.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_monotonic_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_monotonic_gpu_param_2];
; CHECK-NEXT: st.relaxed.gpu.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_monotonic_gpu_param_3];
; CHECK-NEXT: ld.relaxed.gpu.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_monotonic_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.gpu.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.gpu.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.gpu.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.gpu.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.gpu.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.gpu.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.gpu.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.gpu.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.gpu.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
%b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
%c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
%d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
%e.load = load atomic float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
%f.load = load atomic double, ptr addrspace(1) %e syncscope("device") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
ret void
}
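; Volatile monotonic "device"-scope atomics lower to ld/st.mmio.relaxed.sys.global.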
; CHECK-LABEL: global_monotonic_volatile_gpu
define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_monotonic_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_gpu_param_0];
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_monotonic_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_monotonic_volatile_gpu_param_2];
; CHECK-NEXT: st.mmio.relaxed.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_monotonic_volatile_gpu_param_3];
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_monotonic_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
%b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
%c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
%d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
%e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
%f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
ret void
}
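; Monotonic "block"-scope atomics lower to ld/st.relaxed.cta.global.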
; CHECK-LABEL: global_monotonic_cta
define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_monotonic_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_monotonic_cta_param_0];
; CHECK-NEXT: ld.relaxed.cta.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_monotonic_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_monotonic_cta_param_2];
; CHECK-NEXT: st.relaxed.cta.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_monotonic_cta_param_3];
; CHECK-NEXT: ld.relaxed.cta.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_monotonic_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.cta.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.cta.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.cta.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.cta.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.cta.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.cta.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.cta.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.cta.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.cta.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
%b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
%c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
%d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
%e.load = load atomic float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
%f.load = load atomic double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
ret void
}
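; Volatile monotonic "block"-scope atomics also widen to sys scope: ld/st.mmio.relaxed.sys.global.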
; CHECK-LABEL: global_monotonic_volatile_cta
define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_monotonic_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_cta_param_0];
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_monotonic_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_monotonic_volatile_cta_param_2];
; CHECK-NEXT: st.mmio.relaxed.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_monotonic_volatile_cta_param_3];
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_monotonic_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.mmio.relaxed.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.mmio.relaxed.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
%b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
%c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
%d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
%e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
%f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
ret void
}
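; Acquire loads and release stores at the default system scope lower to ld.acquire.sys.global and st.release.sys.global.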
; CHECK-LABEL: global_acq_rel_sys
define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_acq_rel_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_acq_rel_sys_param_0];
; CHECK-NEXT: ld.acquire.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_acq_rel_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_acq_rel_sys_param_2];
; CHECK-NEXT: st.release.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_acq_rel_sys_param_3];
; CHECK-NEXT: ld.acquire.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_acq_rel_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(1) %a release, align 1
%b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(1) %b release, align 2
%c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(1) %c release, align 4
%d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(1) %d release, align 8
%e.load = load atomic float, ptr addrspace(1) %e acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(1) %e release, align 4
%f.load = load atomic double, ptr addrspace(1) %e acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(1) %e release, align 8
ret void
}
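; Volatile acquire/release at system scope currently lowers identically to the non-volatile form.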
; CHECK-LABEL: global_acq_rel_volatile_sys
define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_acq_rel_volatile_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_acq_rel_volatile_sys_param_0];
; CHECK-NEXT: ld.acquire.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_acq_rel_volatile_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_acq_rel_volatile_sys_param_2];
; CHECK-NEXT: st.release.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_acq_rel_volatile_sys_param_3];
; CHECK-NEXT: ld.acquire.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_acq_rel_volatile_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1
%b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2
%c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4
%d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8
%e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4
%f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8
ret void
}
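; Acquire/release at "device" scope lowers to ld.acquire.gpu.global and st.release.gpu.global.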
; CHECK-LABEL: global_acq_rel_gpu
define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_acq_rel_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_acq_rel_gpu_param_0];
; CHECK-NEXT: ld.acquire.gpu.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_acq_rel_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_acq_rel_gpu_param_2];
; CHECK-NEXT: st.release.gpu.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_acq_rel_gpu_param_3];
; CHECK-NEXT: ld.acquire.gpu.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_acq_rel_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.gpu.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.gpu.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.gpu.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.gpu.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.gpu.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.gpu.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.gpu.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.gpu.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.gpu.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
%b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
%c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
%d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
%e.load = load atomic float, ptr addrspace(1) %e syncscope("device") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
%f.load = load atomic double, ptr addrspace(1) %e syncscope("device") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
ret void
}
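; Volatile acquire/release widens "device" scope to sys: ld.acquire.sys.global and st.release.sys.global.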
; CHECK-LABEL: global_acq_rel_volatile_gpu
define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_acq_rel_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_acq_rel_volatile_gpu_param_0];
; CHECK-NEXT: ld.acquire.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_acq_rel_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_acq_rel_volatile_gpu_param_2];
; CHECK-NEXT: st.release.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_acq_rel_volatile_gpu_param_3];
; CHECK-NEXT: ld.acquire.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_acq_rel_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
%b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
%c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
%d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
%e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
%f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
ret void
}
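; Acquire/release at "block" scope lowers to ld.acquire.cta.global and st.release.cta.global.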
; CHECK-LABEL: global_acq_rel_cta
define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_acq_rel_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_acq_rel_cta_param_0];
; CHECK-NEXT: ld.acquire.cta.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_acq_rel_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_acq_rel_cta_param_2];
; CHECK-NEXT: st.release.cta.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_acq_rel_cta_param_3];
; CHECK-NEXT: ld.acquire.cta.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_acq_rel_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.cta.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.cta.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.cta.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.cta.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.cta.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.cta.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.cta.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.cta.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.cta.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
%b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
%c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
%d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
%e.load = load atomic float, ptr addrspace(1) %e syncscope("block") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
%f.load = load atomic double, ptr addrspace(1) %e syncscope("block") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
ret void
}
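; Volatile acquire/release widens "block" scope to sys as well.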
; CHECK-LABEL: global_acq_rel_volatile_cta
define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_acq_rel_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_acq_rel_volatile_cta_param_0];
; CHECK-NEXT: ld.acquire.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_acq_rel_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_acq_rel_volatile_cta_param_2];
; CHECK-NEXT: st.release.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_acq_rel_volatile_cta_param_3];
; CHECK-NEXT: ld.acquire.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_acq_rel_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
%b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
%c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
%d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
%e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
%f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
ret void
}
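; seq_cst at system scope emits fence.sc.sys before each acquire load and each release store.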
; CHECK-LABEL: global_seq_cst_sys
define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_seq_cst_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_seq_cst_sys_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_seq_cst_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_seq_cst_sys_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_seq_cst_sys_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_seq_cst_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
%b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
%c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
%d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
%e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4
%f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8
ret void
}
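; Volatile seq_cst at system scope currently lowers identically to the non-volatile form.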
; CHECK-LABEL: global_seq_cst_volatile_sys
define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_seq_cst_volatile_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_seq_cst_volatile_sys_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_seq_cst_volatile_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_seq_cst_volatile_sys_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_seq_cst_volatile_sys_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_seq_cst_volatile_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
%b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
%c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
%d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
%e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4
%f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8
ret void
}
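; seq_cst at "device" scope uses fence.sc.gpu with gpu-scoped acquire/release accesses.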
; CHECK-LABEL: global_seq_cst_gpu
define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_seq_cst_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_seq_cst_gpu_param_0];
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_seq_cst_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_seq_cst_gpu_param_2];
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_seq_cst_gpu_param_3];
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_seq_cst_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.global.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.global.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.global.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.global.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
%b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
%c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
%d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
%e.load = load atomic float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
%f.load = load atomic double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
ret void
}
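; Volatile seq_cst widens "device" scope to sys: fence.sc.sys with sys-scoped accesses.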
; CHECK-LABEL: global_seq_cst_volatile_gpu
define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_seq_cst_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_seq_cst_volatile_gpu_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_seq_cst_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_seq_cst_volatile_gpu_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_seq_cst_volatile_gpu_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_seq_cst_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
%b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
%c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
%d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
%e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
%f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
ret void
}
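; seq_cst at "block" scope uses fence.sc.cta with cta-scoped acquire/release accesses.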
; CHECK-LABEL: global_seq_cst_cta
define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_seq_cst_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_seq_cst_cta_param_0];
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_seq_cst_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_seq_cst_cta_param_2];
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_seq_cst_cta_param_3];
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_seq_cst_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.global.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.global.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.global.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.global.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
%b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
%c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
%d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
%e.load = load atomic float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
%f.load = load atomic double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
ret void
}
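; Volatile seq_cst widens "block" scope to sys as well.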
; CHECK-LABEL: global_seq_cst_volatile_cta
define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK-LABEL: global_seq_cst_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [global_seq_cst_volatile_cta_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [global_seq_cst_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [global_seq_cst_volatile_cta_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [global_seq_cst_volatile_cta_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [global_seq_cst_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.global.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.global.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
%b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
%c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
%d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
%e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
%f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
ret void
}
;; shared statespace
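; Unordered "device"-scope atomics on shared memory lower to ld/st.relaxed.gpu.shared.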
; CHECK-LABEL: shared_unordered_gpu
define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_unordered_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_unordered_gpu_param_0];
; CHECK-NEXT: ld.relaxed.gpu.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_unordered_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_unordered_gpu_param_2];
; CHECK-NEXT: st.relaxed.gpu.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_unordered_gpu_param_3];
; CHECK-NEXT: ld.relaxed.gpu.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_unordered_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.gpu.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.gpu.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.gpu.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.gpu.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.gpu.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.gpu.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.gpu.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.gpu.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.gpu.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
%b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
%c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
%d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
%e.load = load atomic float, ptr addrspace(3) %e syncscope("device") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
%f.load = load atomic double, ptr addrspace(3) %e syncscope("device") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
ret void
}
; CHECK-LABEL: shared_unordered_volatile_gpu
define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_unordered_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_unordered_volatile_gpu_param_0];
; CHECK-NEXT: ld.volatile.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_unordered_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_unordered_volatile_gpu_param_2];
; CHECK-NEXT: st.volatile.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_unordered_volatile_gpu_param_3];
; CHECK-NEXT: ld.volatile.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_unordered_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.volatile.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.volatile.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.volatile.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.volatile.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.volatile.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.volatile.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.volatile.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.volatile.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.volatile.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
%b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
%c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
%d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
%e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
%f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
ret void
}
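; Note: volatile unordered accesses drop the scope qualifier entirely and
; lower to plain ld.volatile.shared/st.volatile.shared, as the CHECK lines
; above show.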
; CHECK-LABEL: shared_unordered_cta
define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_unordered_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_unordered_cta_param_0];
; CHECK-NEXT: ld.relaxed.cta.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_unordered_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_unordered_cta_param_2];
; CHECK-NEXT: st.relaxed.cta.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_unordered_cta_param_3];
; CHECK-NEXT: ld.relaxed.cta.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_unordered_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.cta.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.cta.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.cta.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.cta.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.cta.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.cta.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.cta.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.cta.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.cta.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
%b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
%c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
%d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
%e.load = load atomic float, ptr addrspace(3) %e syncscope("block") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
%f.load = load atomic double, ptr addrspace(3) %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
ret void
}
; CHECK-LABEL: shared_unordered_volatile_cta
define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_unordered_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_unordered_volatile_cta_param_0];
; CHECK-NEXT: ld.volatile.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_unordered_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_unordered_volatile_cta_param_2];
; CHECK-NEXT: st.volatile.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_unordered_volatile_cta_param_3];
; CHECK-NEXT: ld.volatile.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_unordered_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.volatile.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.volatile.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.volatile.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.volatile.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.volatile.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.volatile.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.volatile.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.volatile.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.volatile.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
%b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
%c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
%d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
%e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
%f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
ret void
}
; CHECK-LABEL: shared_monotonic_gpu
define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_monotonic_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_monotonic_gpu_param_0];
; CHECK-NEXT: ld.relaxed.gpu.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_monotonic_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_monotonic_gpu_param_2];
; CHECK-NEXT: st.relaxed.gpu.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_monotonic_gpu_param_3];
; CHECK-NEXT: ld.relaxed.gpu.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_monotonic_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.gpu.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.gpu.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.gpu.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.gpu.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.gpu.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.gpu.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.gpu.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.gpu.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.gpu.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
%b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
%c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
%d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
%e.load = load atomic float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
%f.load = load atomic double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
ret void
}
; CHECK-LABEL: shared_monotonic_volatile_gpu
define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_monotonic_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_monotonic_volatile_gpu_param_0];
; CHECK-NEXT: ld.volatile.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_monotonic_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_monotonic_volatile_gpu_param_2];
; CHECK-NEXT: st.volatile.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_monotonic_volatile_gpu_param_3];
; CHECK-NEXT: ld.volatile.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_monotonic_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.volatile.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.volatile.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.volatile.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.volatile.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.volatile.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.volatile.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.volatile.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.volatile.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.volatile.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
%b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
%c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
%d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
%e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
%f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
ret void
}
; CHECK-LABEL: shared_monotonic_cta
define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_monotonic_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_monotonic_cta_param_0];
; CHECK-NEXT: ld.relaxed.cta.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_monotonic_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_monotonic_cta_param_2];
; CHECK-NEXT: st.relaxed.cta.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_monotonic_cta_param_3];
; CHECK-NEXT: ld.relaxed.cta.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_monotonic_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.relaxed.cta.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.relaxed.cta.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.relaxed.cta.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.relaxed.cta.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.relaxed.cta.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.relaxed.cta.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.relaxed.cta.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.relaxed.cta.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.relaxed.cta.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
%b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
%c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
%d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
%e.load = load atomic float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
%f.load = load atomic double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
ret void
}
; CHECK-LABEL: shared_monotonic_volatile_cta
define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_monotonic_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_monotonic_volatile_cta_param_0];
; CHECK-NEXT: ld.volatile.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_monotonic_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_monotonic_volatile_cta_param_2];
; CHECK-NEXT: st.volatile.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_monotonic_volatile_cta_param_3];
; CHECK-NEXT: ld.volatile.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_monotonic_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.volatile.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.volatile.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.volatile.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.volatile.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.volatile.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.volatile.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.volatile.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.volatile.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.volatile.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
%b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
%c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
%d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
%e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
%f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
ret void
}
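; Note: monotonic lowers exactly like unordered: both map to the relaxed
; (or, when volatile, volatile) ld/st forms, so the CHECK bodies above are
; identical to those of the corresponding unordered tests.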
; CHECK-LABEL: shared_acq_rel_sys
define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_acq_rel_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_acq_rel_sys_param_0];
; CHECK-NEXT: ld.acquire.sys.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_acq_rel_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_acq_rel_sys_param_2];
; CHECK-NEXT: st.release.sys.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_acq_rel_sys_param_3];
; CHECK-NEXT: ld.acquire.sys.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_acq_rel_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(3) %a release, align 1
%b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(3) %b release, align 2
%c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(3) %c release, align 4
%d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(3) %d release, align 8
%e.load = load atomic float, ptr addrspace(3) %e acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(3) %e release, align 4
%f.load = load atomic double, ptr addrspace(3) %e acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(3) %e release, align 8
ret void
}
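; Note: with no syncscope the IR requests system scope, hence the .sys
; qualifier on the ld.acquire/st.release instructions above.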
; CHECK-LABEL: shared_acq_rel_volatile_sys
define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_acq_rel_volatile_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_acq_rel_volatile_sys_param_0];
; CHECK-NEXT: ld.acquire.sys.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_acq_rel_volatile_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_acq_rel_volatile_sys_param_2];
; CHECK-NEXT: st.release.sys.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_acq_rel_volatile_sys_param_3];
; CHECK-NEXT: ld.acquire.sys.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_acq_rel_volatile_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1
%b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2
%c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4
%d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8
%e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4
%f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8
ret void
}
; CHECK-LABEL: shared_acq_rel_gpu
define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_acq_rel_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_acq_rel_gpu_param_0];
; CHECK-NEXT: ld.acquire.gpu.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_acq_rel_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_acq_rel_gpu_param_2];
; CHECK-NEXT: st.release.gpu.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_acq_rel_gpu_param_3];
; CHECK-NEXT: ld.acquire.gpu.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_acq_rel_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.gpu.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.gpu.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.gpu.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.gpu.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.gpu.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.gpu.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.gpu.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.gpu.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.gpu.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
%b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
%c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
%d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
%e.load = load atomic float, ptr addrspace(3) %e syncscope("device") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
%f.load = load atomic double, ptr addrspace(3) %e syncscope("device") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
ret void
}
; CHECK-LABEL: shared_acq_rel_volatile_gpu
define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_acq_rel_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_acq_rel_volatile_gpu_param_0];
; CHECK-NEXT: ld.acquire.sys.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_acq_rel_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_acq_rel_volatile_gpu_param_2];
; CHECK-NEXT: st.release.sys.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_acq_rel_volatile_gpu_param_3];
; CHECK-NEXT: ld.acquire.sys.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_acq_rel_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
%b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
%c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
%d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
%e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
%f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
ret void
}
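; Note: volatile acquire/release accesses are emitted at .sys scope even
; though the IR requests the narrower "device" scope.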
; CHECK-LABEL: shared_acq_rel_cta
define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_acq_rel_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_acq_rel_cta_param_0];
; CHECK-NEXT: ld.acquire.cta.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_acq_rel_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_acq_rel_cta_param_2];
; CHECK-NEXT: st.release.cta.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_acq_rel_cta_param_3];
; CHECK-NEXT: ld.acquire.cta.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_acq_rel_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.cta.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.cta.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.cta.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.cta.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.cta.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.cta.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.cta.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.cta.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.cta.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
%b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
%c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
%d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
%e.load = load atomic float, ptr addrspace(3) %e syncscope("block") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
%f.load = load atomic double, ptr addrspace(3) %e syncscope("block") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
ret void
}
; CHECK-LABEL: shared_acq_rel_volatile_cta
define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_acq_rel_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_acq_rel_volatile_cta_param_0];
; CHECK-NEXT: ld.acquire.sys.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_acq_rel_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_acq_rel_volatile_cta_param_2];
; CHECK-NEXT: st.release.sys.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_acq_rel_volatile_cta_param_3];
; CHECK-NEXT: ld.acquire.sys.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_acq_rel_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.release.sys.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd3], %r2;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd5], %r4;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
%b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
%c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
%d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
%e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
%f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
ret void
}
; CHECK-LABEL: shared_seq_cst_sys
define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_seq_cst_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_seq_cst_sys_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_seq_cst_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_seq_cst_sys_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_seq_cst_sys_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_seq_cst_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
%b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
%c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
%d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
%e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4
%f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8
ret void
}
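; Note: seq_cst is implemented as a fence.sc at the requested scope
; immediately before each acquire load and release store.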
; CHECK-LABEL: shared_seq_cst_volatile_sys
define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_seq_cst_volatile_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_seq_cst_volatile_sys_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_seq_cst_volatile_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_seq_cst_volatile_sys_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_seq_cst_volatile_sys_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_seq_cst_volatile_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
%b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
%c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
%d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
%e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4
%f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8
ret void
}
; CHECK-LABEL: shared_seq_cst_gpu
define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_seq_cst_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_seq_cst_gpu_param_0];
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_seq_cst_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_seq_cst_gpu_param_2];
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_seq_cst_gpu_param_3];
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_seq_cst_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.shared.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.shared.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: ld.acquire.gpu.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.gpu;
; CHECK-NEXT: st.release.gpu.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
%b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
%c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
%d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
%e.load = load atomic float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
%f.load = load atomic double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
ret void
}
; CHECK-LABEL: shared_seq_cst_volatile_gpu
define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_seq_cst_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_seq_cst_volatile_gpu_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_seq_cst_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_seq_cst_volatile_gpu_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_seq_cst_volatile_gpu_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_seq_cst_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
%b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
%c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
%d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
%e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
%f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
ret void
}
; CHECK-LABEL: shared_seq_cst_cta
define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_seq_cst_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_seq_cst_cta_param_0];
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_seq_cst_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_seq_cst_cta_param_2];
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_seq_cst_cta_param_3];
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_seq_cst_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.shared.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.shared.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: ld.acquire.cta.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.cta;
; CHECK-NEXT: st.release.cta.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
%b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
%c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
%d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
%e.load = load atomic float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
%f.load = load atomic double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
ret void
}
; CHECK-LABEL: shared_seq_cst_volatile_cta
define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK-LABEL: shared_seq_cst_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [shared_seq_cst_volatile_cta_param_0];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [shared_seq_cst_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [shared_seq_cst_volatile_cta_param_2];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [shared_seq_cst_volatile_cta_param_3];
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [shared_seq_cst_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b16 [%rd2], %rs4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd3], %r2;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd4], %rd7;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b32 [%rd5], %r4;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.acquire.sys.shared.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: st.release.sys.shared.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
%b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
%c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
%d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
%e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
%f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
ret void
}
;; local statespace
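; Note: the .local statespace is private to a single thread, so its contents
; are never observable by other threads. Accordingly, every variant below
; (unordered, monotonic, acq_rel, seq_cst, with or without volatile, at
; sys/gpu/cta scope) currently lowers to plain ld.local/st.local with no
; ordering qualifiers or fences.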
; CHECK-LABEL: local_unordered_gpu
define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_unordered_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_gpu_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_unordered_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_unordered_gpu_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_unordered_gpu_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_unordered_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
%b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
%c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
%d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
%e.load = load atomic float, ptr addrspace(5) %e syncscope("device") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
%f.load = load atomic double, ptr addrspace(5) %e syncscope("device") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
ret void
}
; CHECK-LABEL: local_unordered_volatile_gpu
define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_unordered_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_volatile_gpu_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_unordered_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_unordered_volatile_gpu_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_unordered_volatile_gpu_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_unordered_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
%b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
%c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
%d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
%e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
%f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
ret void
}
; CHECK-LABEL: local_unordered_cta
define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_unordered_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_cta_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_unordered_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_unordered_cta_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_unordered_cta_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_unordered_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
%b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
%c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
%d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
%e.load = load atomic float, ptr addrspace(5) %e syncscope("block") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
%f.load = load atomic double, ptr addrspace(5) %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
ret void
}
; CHECK-LABEL: local_unordered_volatile_cta
define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_unordered_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_volatile_cta_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_unordered_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_unordered_volatile_cta_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_unordered_volatile_cta_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_unordered_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
%b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
%c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
%d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
%e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") unordered, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
%f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
ret void
}
; CHECK-LABEL: local_monotonic_gpu
define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_monotonic_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_gpu_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_monotonic_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_monotonic_gpu_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_monotonic_gpu_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_monotonic_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
%b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
%c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
%d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
%e.load = load atomic float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
%f.load = load atomic double, ptr addrspace(5) %e syncscope("device") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
ret void
}
; CHECK-LABEL: local_monotonic_volatile_gpu
define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_monotonic_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_volatile_gpu_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_monotonic_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_monotonic_volatile_gpu_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_monotonic_volatile_gpu_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_monotonic_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
%b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
%c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
%d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
%e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
%f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
ret void
}
; CHECK-LABEL: local_monotonic_cta
define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_monotonic_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_cta_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_monotonic_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_monotonic_cta_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_monotonic_cta_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_monotonic_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
%b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
%c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
%d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
%e.load = load atomic float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
%f.load = load atomic double, ptr addrspace(5) %e syncscope("block") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
ret void
}
; CHECK-LABEL: local_monotonic_volatile_cta
define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_monotonic_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_volatile_cta_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_monotonic_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_monotonic_volatile_cta_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_monotonic_volatile_cta_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_monotonic_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
%b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
%c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
%d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
%e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
%f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") monotonic, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
ret void
}
; CHECK-LABEL: local_acq_rel_sys
define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_acq_rel_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_acq_rel_sys_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_acq_rel_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_acq_rel_sys_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_acq_rel_sys_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_acq_rel_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(5) %a release, align 1
%b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(5) %b release, align 2
%c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(5) %c release, align 4
%d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(5) %d release, align 8
%e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(5) %e release, align 4
%f.load = load atomic double, ptr addrspace(5) %e acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(5) %e release, align 8
ret void
}
; CHECK-LABEL: local_acq_rel_volatile_sys
define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_acq_rel_volatile_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_acq_rel_volatile_sys_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_acq_rel_volatile_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_acq_rel_volatile_sys_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_acq_rel_volatile_sys_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_acq_rel_volatile_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1
%b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2
%c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4
%d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8
%e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
%f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8
ret void
}
; CHECK-LABEL: local_acq_rel_gpu
define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_acq_rel_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_acq_rel_gpu_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_acq_rel_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_acq_rel_gpu_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_acq_rel_gpu_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_acq_rel_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
%b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
%c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
%d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
%e.load = load atomic float, ptr addrspace(5) %e syncscope("device") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
%f.load = load atomic double, ptr addrspace(5) %e syncscope("device") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
ret void
}
; CHECK-LABEL: local_acq_rel_volatile_gpu
define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_acq_rel_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_acq_rel_volatile_gpu_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_acq_rel_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_acq_rel_volatile_gpu_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_acq_rel_volatile_gpu_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_acq_rel_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
%b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
%c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
%d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
%e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
%f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
ret void
}
; CHECK-LABEL: local_acq_rel_cta
define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_acq_rel_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_acq_rel_cta_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_acq_rel_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_acq_rel_cta_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_acq_rel_cta_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_acq_rel_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
%b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
%c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
%d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
%e.load = load atomic float, ptr addrspace(5) %e syncscope("block") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
%f.load = load atomic double, ptr addrspace(5) %e syncscope("block") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
ret void
}
; CHECK-LABEL: local_acq_rel_volatile_cta
define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_acq_rel_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_acq_rel_volatile_cta_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_acq_rel_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_acq_rel_volatile_cta_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_acq_rel_volatile_cta_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_acq_rel_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
%b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
%c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
%d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
%e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") acquire, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
%f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") acquire, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
ret void
}
; CHECK-LABEL: local_seq_cst_sys
define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_seq_cst_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_seq_cst_sys_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_seq_cst_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_seq_cst_sys_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_seq_cst_sys_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_seq_cst_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(5) %a seq_cst, align 1
%b.load = load atomic i16, ptr addrspace(5) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(5) %b seq_cst, align 2
%c.load = load atomic i32, ptr addrspace(5) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(5) %c seq_cst, align 4
%d.load = load atomic i64, ptr addrspace(5) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(5) %d seq_cst, align 8
%e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4
%f.load = load atomic double, ptr addrspace(5) %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(5) %e seq_cst, align 8
ret void
}
; CHECK-LABEL: local_seq_cst_volatile_sys
define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_seq_cst_volatile_sys(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_seq_cst_volatile_sys_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_seq_cst_volatile_sys_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_seq_cst_volatile_sys_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_seq_cst_volatile_sys_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_seq_cst_volatile_sys_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(5) %a seq_cst, align 1
%b.load = load atomic volatile i16, ptr addrspace(5) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(5) %b seq_cst, align 2
%c.load = load atomic volatile i32, ptr addrspace(5) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(5) %c seq_cst, align 4
%d.load = load atomic volatile i64, ptr addrspace(5) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(5) %d seq_cst, align 8
%e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4
%f.load = load atomic volatile double, ptr addrspace(5) %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8
ret void
}
; CHECK-LABEL: local_seq_cst_gpu
define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_seq_cst_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_seq_cst_gpu_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_seq_cst_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_seq_cst_gpu_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_seq_cst_gpu_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_seq_cst_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
%b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
%c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
%d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
%e.load = load atomic float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
%f.load = load atomic double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
ret void
}
; CHECK-LABEL: local_seq_cst_volatile_gpu
define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_seq_cst_volatile_gpu(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_seq_cst_volatile_gpu_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_seq_cst_volatile_gpu_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_seq_cst_volatile_gpu_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_seq_cst_volatile_gpu_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_seq_cst_volatile_gpu_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
%b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
%c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
%d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
%e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
%f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
ret void
}
; CHECK-LABEL: local_seq_cst_cta
define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_seq_cst_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_seq_cst_cta_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_seq_cst_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_seq_cst_cta_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_seq_cst_cta_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_seq_cst_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
%b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
%c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
%d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
%e.load = load atomic float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
%f.load = load atomic double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
ret void
}
; CHECK-LABEL: local_seq_cst_volatile_cta
define void @local_seq_cst_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_seq_cst_volatile_cta(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_seq_cst_volatile_cta_param_0];
; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.param.b64 %rd2, [local_seq_cst_volatile_cta_param_1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: ld.param.b64 %rd3, [local_seq_cst_volatile_cta_param_2];
; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ld.param.b64 %rd4, [local_seq_cst_volatile_cta_param_3];
; CHECK-NEXT: ld.local.b16 %rs3, [%rd2];
; CHECK-NEXT: ld.param.b64 %rd5, [local_seq_cst_volatile_cta_param_4];
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: st.local.b16 [%rd2], %rs4;
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
; CHECK-NEXT: st.local.b32 [%rd3], %r2;
; CHECK-NEXT: ld.local.b64 %rd6, [%rd4];
; CHECK-NEXT: add.s64 %rd7, %rd6, 1;
; CHECK-NEXT: st.local.b64 [%rd4], %rd7;
; CHECK-NEXT: ld.local.b32 %r3, [%rd5];
; CHECK-NEXT: add.rn.f32 %r4, %r3, 0f3F800000;
; CHECK-NEXT: st.local.b32 [%rd5], %r4;
; CHECK-NEXT: ld.local.b64 %rd8, [%rd5];
; CHECK-NEXT: add.rn.f64 %rd9, %rd8, 0d3FF0000000000000;
; CHECK-NEXT: st.local.b64 [%rd5], %rd9;
; CHECK-NEXT: ret;
%a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
%a.add = add i8 %a.load, 1
store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
%b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
%b.add = add i16 %b.load, 1
store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
%c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
%c.add = add i32 %c.load, 1
store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
%d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
%d.add = add i64 %d.load, 1
store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
%e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
%e.add = fadd float %e.load, 1.
store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
%f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
%f.add = fadd double %f.load, 1.
store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
ret void
}