; Tests NVPTX lowering of addrspace(7) pointers and related intrinsics to PTX shared::cluster instructions.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -o - -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %}
; Compile for the 64-bit NVPTX CUDA target.
target triple = "nvptx64-nvidia-cuda"
; Intrinsics exercised by the tests in this file.
; Takes a shared-memory pointer (addrspace 3) and an i32 operand and returns an
; addrspace(7) pointer; the expected PTX shows it lowering to mapa.shared::cluster.u64.
declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3), i32)
; Predicate query on a generic pointer; the expected PTX shows isspacep.shared::cluster.
declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr)
; Reads the %ctaid.x special register (lowered to a mov.u32 from %ctaid.x).
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; Generic-pointer form of mapa; the expected PTX shows mapa.u64.
declare ptr @llvm.nvvm.mapa(ptr, i32)
; Basic addressing test for the shared::cluster space.
; Exercises, in order: reading ctaid.x, isspacep.shared.cluster on a generic
; pointer, the generic mapa intrinsic, isspacep.shared.cluster on the mapa
; result, the addrspace(3) -> addrspace(7) mapa intrinsic, and a plain
; load/add/store through the resulting addrspace(7) pointer.
; Parameters: %ptr is a generic pointer; %smem_ptr is an addrspace(3) pointer.
; Returns the sum of the two isspacep results, each zero-extended to i32.
define i32 @test_distributed_shared_cluster_common(ptr %ptr, ptr addrspace(3) %smem_ptr) local_unnamed_addr {
; CHECK-LABEL: test_distributed_shared_cluster_common(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b32 %r<8>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_common_param_0];
; CHECK-NEXT: ld.param.u64 %rd2, [test_distributed_shared_cluster_common_param_1];
; CHECK-NEXT: mov.u32 %r1, %ctaid.x;
; CHECK-NEXT: xor.b32 %r2, %r1, 1;
; CHECK-NEXT: isspacep.shared::cluster %p1, %rd1;
; CHECK-NEXT: mapa.u64 %rd3, %rd1, %r2;
; CHECK-NEXT: isspacep.shared::cluster %p2, %rd3;
; CHECK-NEXT: mapa.shared::cluster.u64 %rd4, %rd2, %r2;
; CHECK-NEXT: ld.shared::cluster.u32 %r3, [%rd4];
; CHECK-NEXT: add.s32 %r4, %r3, 42;
; CHECK-NEXT: st.shared::cluster.u32 [%rd4], %r4;
; CHECK-NEXT: selp.b32 %r5, 1, 0, %p1;
; CHECK-NEXT: selp.b32 %r6, 1, 0, %p2;
; CHECK-NEXT: add.s32 %r7, %r5, %r6;
; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NEXT: ret;
entry:
; Second operand for both mapa calls: ctaid.x with its low bit flipped.
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%1 = xor i32 %0, 1
; Query the incoming generic pointer, map it, then query the mapped pointer.
%2 = tail call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %ptr)
%3 = tail call ptr @llvm.nvvm.mapa(ptr %ptr, i32 %1)
%4 = tail call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %3)
; Map the addrspace(3) pointer to an addrspace(7) pointer.
%dsmem_ptr = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %smem_ptr, i32 %1)
; Load, add 42, and store back through the addrspace(7) pointer; expected to
; lower to ld.shared::cluster / st.shared::cluster.
%loaded_val = load i32, ptr addrspace(7) %dsmem_ptr
%updated_val = add i32 %loaded_val, 42
store i32 %updated_val, ptr addrspace(7) %dsmem_ptr
; The return value is the sum of the two isspacep results only; the loaded
; value is not part of it.
%5 = zext i1 %2 to i32
%6 = zext i1 %4 to i32
%ret = add i32 %5, %6
ret i32 %ret
}
; Floating-point atomic add on an addrspace(7) pointer.
; Covers atomicrmw fadd with half, bfloat, float and double operands (all with
; the constant 1.0); each is expected to lower to a single
; atom.shared::cluster.add instruction of the matching type.
; NOTE(review): the IR uses seq_cst ordering, but the expected PTX below shows
; no fence and no ordering qualifier on these atom instructions - confirm that
; this is the intended lowering.
define void @test_distributed_shared_cluster_float_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
; CHECK-LABEL: test_distributed_shared_cluster_float_atomic(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %f<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-NEXT: .reg .b64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0];
; CHECK-NEXT: mov.b16 %rs1, 0x3C00;
; CHECK-NEXT: atom.shared::cluster.add.noftz.f16 %rs2, [%rd1], %rs1;
; CHECK-NEXT: mov.b16 %rs3, 0x3F80;
; CHECK-NEXT: atom.shared::cluster.add.noftz.bf16 %rs4, [%rd1], %rs3;
; CHECK-NEXT: atom.shared::cluster.add.f32 %f1, [%rd1], 0f3F800000;
; CHECK-NEXT: atom.shared::cluster.add.f64 %fd1, [%rd1], 0d3FF0000000000000;
; CHECK-NEXT: ret;
entry:
; One fadd per floating-point type: half, bfloat, float, double.
; The 16-bit operands are materialized into registers first (0x3C00 for half,
; 0x3F80 for bfloat); the f32/f64 operands appear as immediates.
%0 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, half 1.000000e+00 seq_cst
%1 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, bfloat 1.000000e+00 seq_cst
%2 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, float 1.000000e+00 seq_cst
%3 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, double 1.000000e+00 seq_cst
ret void
}
; Integer atomic read-modify-write operations on an addrspace(7) pointer.
; Covers add, xchg, signed/unsigned min and max for both i32 and i64, plus
; uinc_wrap/udec_wrap for i32. Every operation uses monotonic ordering and the
; constant operand 1, and is expected to lower to a single
; atom.shared::cluster.<op> instruction.
define void @test_distributed_shared_cluster_int_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
; CHECK-LABEL: test_distributed_shared_cluster_int_atomic(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<8>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_int_atomic_param_0];
; CHECK-NEXT: atom.shared::cluster.add.u32 %r1, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.add.u64 %rd2, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.exch.b32 %r2, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.exch.b64 %rd3, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.min.s32 %r3, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.min.s64 %rd4, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.min.u32 %r4, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.min.u64 %rd5, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.max.s32 %r5, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.max.s64 %rd6, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.max.u32 %r6, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.max.u64 %rd7, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.inc.u32 %r7, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.dec.u32 %r8, [%rd1], 1;
; CHECK-NEXT: ret;
entry:
; Integer add (i32, i64) -> atom.add.u32 / atom.add.u64
%0 = atomicrmw add ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
%1 = atomicrmw add ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
; Exchange (i32, i64) -> atom.exch.b32 / atom.exch.b64
%2 = atomicrmw xchg ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
%3 = atomicrmw xchg ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
; Min: signed (min) then unsigned (umin), each for i32 and i64
%4 = atomicrmw min ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
%5 = atomicrmw min ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
%6 = atomicrmw umin ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
%7 = atomicrmw umin ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
; Max: signed (max) then unsigned (umax), each for i32 and i64
%8 = atomicrmw max ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
%9 = atomicrmw max ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
%10 = atomicrmw umax ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
%11 = atomicrmw umax ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
; Wrapping increment/decrement -> atom.inc.u32 / atom.dec.u32
; (only the i32 form is exercised here)
%12 = atomicrmw uinc_wrap ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
%13 = atomicrmw udec_wrap ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
ret void
}
; Bitwise atomic read-modify-write operations on an addrspace(7) pointer.
; Covers and, or and xor for both i32 and i64, all with monotonic ordering and
; the constant operand 1; each is expected to lower to a single
; atom.shared::cluster.<op>.b32 or .b64 instruction.
define void @test_distributed_shared_cluster_bitwise_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
; CHECK-LABEL: test_distributed_shared_cluster_bitwise_atomic(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_bitwise_atomic_param_0];
; CHECK-NEXT: atom.shared::cluster.and.b32 %r1, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.and.b64 %rd2, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.or.b32 %r2, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.or.b64 %rd3, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.xor.b32 %r3, [%rd1], 1;
; CHECK-NEXT: atom.shared::cluster.xor.b64 %rd4, [%rd1], 1;
; CHECK-NEXT: ret;
entry:
; and / or / xor, each issued as an i32 operation followed by an i64 operation
%0 = atomicrmw and ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
%1 = atomicrmw and ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
%2 = atomicrmw or ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
%3 = atomicrmw or ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
%4 = atomicrmw xor ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
%5 = atomicrmw xor ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
ret void
}
; Compare-exchange on an addrspace(7) pointer across memory orderings and widths.
; Every cmpxchg compares against 1 and stores 0 on success.
; - i32 and i64: nine success/failure ordering pairs each. The expected PTX
;   shows one atom.<order>.shared::cluster.cas per operation; the seq_cst
;   variants appear as fence.sc.sys followed by an acquire cas.
; - i16: five ordering pairs. The expected PTX shows each one expanded into a
;   masked 32-bit CAS loop (partword.cmpxchg.* blocks) on the enclosing aligned
;   word, using atom.relaxed cas with separate fence.acq_rel.sys / fence.sc.sys
;   instructions around the loops for the non-monotonic orderings.
define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
; CHECK-LABEL: test_distributed_shared_cluster_cmpxchg(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<11>;
; CHECK-NEXT: .reg .b32 %r<53>;
; CHECK-NEXT: .reg .b64 %rd<12>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.u64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0];
; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r24, [%rd2], 1, 0;
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r25, [%rd2], 1, 0;
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r26, [%rd2], 1, 0;
; CHECK-NEXT: atom.release.shared::cluster.cas.b32 %r27, [%rd2], 1, 0;
; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r28, [%rd2], 1, 0;
; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r29, [%rd2], 1, 0;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r30, [%rd2], 1, 0;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r31, [%rd2], 1, 0;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r32, [%rd2], 1, 0;
; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0;
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0;
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0;
; CHECK-NEXT: atom.release.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0;
; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0;
; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0;
; CHECK-NEXT: and.b64 %rd1, %rd2, -4;
; CHECK-NEXT: cvt.u32.u64 %r33, %rd2;
; CHECK-NEXT: and.b32 %r34, %r33, 3;
; CHECK-NEXT: shl.b32 %r1, %r34, 3;
; CHECK-NEXT: mov.b32 %r35, 65535;
; CHECK-NEXT: shl.b32 %r36, %r35, %r1;
; CHECK-NEXT: not.b32 %r2, %r36;
; CHECK-NEXT: mov.b32 %r37, 1;
; CHECK-NEXT: shl.b32 %r3, %r37, %r1;
; CHECK-NEXT: ld.shared::cluster.u32 %r38, [%rd1];
; CHECK-NEXT: and.b32 %r48, %r38, %r2;
; CHECK-NEXT: $L__BB4_1: // %partword.cmpxchg.loop33
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: or.b32 %r39, %r48, %r3;
; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48;
; CHECK-NEXT: setp.eq.s32 %p1, %r6, %r39;
; CHECK-NEXT: @%p1 bra $L__BB4_3;
; CHECK-NEXT: // %bb.2: // %partword.cmpxchg.failure32
; CHECK-NEXT: // in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT: and.b32 %r7, %r6, %r2;
; CHECK-NEXT: setp.ne.s32 %p2, %r48, %r7;
; CHECK-NEXT: mov.b32 %r48, %r7;
; CHECK-NEXT: @%p2 bra $L__BB4_1;
; CHECK-NEXT: $L__BB4_3: // %partword.cmpxchg.end31
; CHECK-NEXT: ld.shared::cluster.u32 %r40, [%rd1];
; CHECK-NEXT: and.b32 %r49, %r40, %r2;
; CHECK-NEXT: $L__BB4_4: // %partword.cmpxchg.loop23
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: or.b32 %r41, %r49, %r3;
; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49;
; CHECK-NEXT: setp.eq.s32 %p3, %r10, %r41;
; CHECK-NEXT: @%p3 bra $L__BB4_6;
; CHECK-NEXT: // %bb.5: // %partword.cmpxchg.failure22
; CHECK-NEXT: // in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT: and.b32 %r11, %r10, %r2;
; CHECK-NEXT: setp.ne.s32 %p4, %r49, %r11;
; CHECK-NEXT: mov.b32 %r49, %r11;
; CHECK-NEXT: @%p4 bra $L__BB4_4;
; CHECK-NEXT: $L__BB4_6: // %partword.cmpxchg.end21
; CHECK-NEXT: fence.acq_rel.sys;
; CHECK-NEXT: fence.acq_rel.sys;
; CHECK-NEXT: ld.shared::cluster.u32 %r42, [%rd1];
; CHECK-NEXT: and.b32 %r50, %r42, %r2;
; CHECK-NEXT: $L__BB4_7: // %partword.cmpxchg.loop13
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: or.b32 %r43, %r50, %r3;
; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50;
; CHECK-NEXT: setp.eq.s32 %p5, %r14, %r43;
; CHECK-NEXT: @%p5 bra $L__BB4_9;
; CHECK-NEXT: // %bb.8: // %partword.cmpxchg.failure12
; CHECK-NEXT: // in Loop: Header=BB4_7 Depth=1
; CHECK-NEXT: and.b32 %r15, %r14, %r2;
; CHECK-NEXT: setp.ne.s32 %p6, %r50, %r15;
; CHECK-NEXT: mov.b32 %r50, %r15;
; CHECK-NEXT: @%p6 bra $L__BB4_7;
; CHECK-NEXT: $L__BB4_9: // %partword.cmpxchg.end11
; CHECK-NEXT: fence.acq_rel.sys;
; CHECK-NEXT: ld.shared::cluster.u32 %r44, [%rd1];
; CHECK-NEXT: and.b32 %r51, %r44, %r2;
; CHECK-NEXT: $L__BB4_10: // %partword.cmpxchg.loop3
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: or.b32 %r45, %r51, %r3;
; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51;
; CHECK-NEXT: setp.eq.s32 %p7, %r18, %r45;
; CHECK-NEXT: @%p7 bra $L__BB4_12;
; CHECK-NEXT: // %bb.11: // %partword.cmpxchg.failure2
; CHECK-NEXT: // in Loop: Header=BB4_10 Depth=1
; CHECK-NEXT: and.b32 %r19, %r18, %r2;
; CHECK-NEXT: setp.ne.s32 %p8, %r51, %r19;
; CHECK-NEXT: mov.b32 %r51, %r19;
; CHECK-NEXT: @%p8 bra $L__BB4_10;
; CHECK-NEXT: $L__BB4_12: // %partword.cmpxchg.end1
; CHECK-NEXT: fence.acq_rel.sys;
; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: ld.shared::cluster.u32 %r46, [%rd1];
; CHECK-NEXT: and.b32 %r52, %r46, %r2;
; CHECK-NEXT: $L__BB4_13: // %partword.cmpxchg.loop
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: or.b32 %r47, %r52, %r3;
; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52;
; CHECK-NEXT: setp.eq.s32 %p9, %r22, %r47;
; CHECK-NEXT: @%p9 bra $L__BB4_15;
; CHECK-NEXT: // %bb.14: // %partword.cmpxchg.failure
; CHECK-NEXT: // in Loop: Header=BB4_13 Depth=1
; CHECK-NEXT: and.b32 %r23, %r22, %r2;
; CHECK-NEXT: setp.ne.s32 %p10, %r52, %r23;
; CHECK-NEXT: mov.b32 %r52, %r23;
; CHECK-NEXT: @%p10 bra $L__BB4_13;
; CHECK-NEXT: $L__BB4_15: // %partword.cmpxchg.end
; CHECK-NEXT: fence.acq_rel.sys;
; CHECK-NEXT: ret;
entry:
; 32-bit compare-exchange: nine success/failure ordering pairs.
; Expected lowering is one cas.b32 each, with fence.sc.sys before the three
; seq_cst variants.
%0 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 monotonic monotonic
%1 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acquire monotonic
%2 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acquire acquire
%3 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 release monotonic
%4 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acq_rel monotonic
%5 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acq_rel acquire
%6 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst monotonic
%7 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst acquire
%8 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst seq_cst
; 64-bit compare-exchange: the same nine ordering pairs, lowered to cas.b64.
%9 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 monotonic monotonic
%10 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acquire monotonic
%11 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acquire acquire
%12 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 release monotonic
%13 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acq_rel monotonic
%14 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acq_rel acquire
%15 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst monotonic
%16 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst acquire
%17 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst seq_cst
; 16-bit compare-exchange: five ordering pairs. In the expected PTX each one
; becomes a part-word loop: the address is aligned down to 4 bytes, a 0xFFFF
; mask is shifted to the halfword's position, and a relaxed cas.b32 is retried
; on the containing word.
%18 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 monotonic monotonic
%19 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 acquire acquire
%20 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 release monotonic
%21 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 acq_rel acquire
%22 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 seq_cst seq_cst
ret void
}