| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 |
| ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK |
| ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK64 |
| ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | FileCheck %s --check-prefixes=CHECKPTX71 |
| ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} |
| ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} |
| ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %} |
| |
| ; Module-level default triple. Every llc invocation in the header of this file |
| ; passes -mtriple explicitly (nvptx or nvptx64), so this value is only a fallback. |
| target triple = "nvptx64-nvidia-cuda" |
| |
| ; Exercises seq_cst `atomicrmw fadd` on bfloat through a generic pointer |
| ; (%dp0, once with %val and once with the constant 1.0), an addrspace(1) |
| ; pointer (%dp1) and an addrspace(3) pointer (%dp3). All four results are |
| ; unused and the function returns void. |
| ; |
| ; Expected output per prefix, as pinned by the assertions below: |
| ;  - CHECK / CHECK64 (sm_90 + ptx78, 32- and 64-bit pointers): each operation |
| ;    is a single atom.add.noftz.bf16, with a .global or .shared qualifier for |
| ;    the addrspace(1) / addrspace(3) pointers. |
| ;  - CHECKPTX71 (sm_86 + ptx71): each operation is a retry loop that widens |
| ;    the loaded bf16 to f32, adds, rounds back to bf16 and publishes it with |
| ;    atom.cas.b16 until the value read back matches the expected one. |
| ; |
| ; The assertions are autogenerated (see the NOTE on the first line of this |
| ; file); regenerate them with utils/update_llc_test_checks.py rather than |
| ; editing them by hand. |
| define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat %val) { |
| ; CHECK-LABEL: test( |
| ; CHECK: { |
| ; CHECK-NEXT: .reg .b16 %rs<7>; |
| ; CHECK-NEXT: .reg .b32 %r<4>; |
| ; CHECK-EMPTY: |
| ; CHECK-NEXT: // %bb.0: |
| ; CHECK-NEXT: ld.param.u32 %r1, [test_param_0]; |
| ; CHECK-NEXT: ld.param.b16 %rs1, [test_param_3]; |
| ; CHECK-NEXT: atom.add.noftz.bf16 %rs2, [%r1], %rs1; |
| ; CHECK-NEXT: ld.param.u32 %r2, [test_param_1]; |
| ; CHECK-NEXT: mov.b16 %rs3, 0x3F80; |
| ; CHECK-NEXT: atom.add.noftz.bf16 %rs4, [%r1], %rs3; |
| ; CHECK-NEXT: ld.param.u32 %r3, [test_param_2]; |
| ; CHECK-NEXT: atom.global.add.noftz.bf16 %rs5, [%r2], %rs1; |
| ; CHECK-NEXT: atom.shared.add.noftz.bf16 %rs6, [%r3], %rs1; |
| ; CHECK-NEXT: ret; |
| ; |
| ; CHECK64-LABEL: test( |
| ; CHECK64: { |
| ; CHECK64-NEXT: .reg .b16 %rs<7>; |
| ; CHECK64-NEXT: .reg .b64 %rd<4>; |
| ; CHECK64-EMPTY: |
| ; CHECK64-NEXT: // %bb.0: |
| ; CHECK64-NEXT: ld.param.u64 %rd1, [test_param_0]; |
| ; CHECK64-NEXT: ld.param.b16 %rs1, [test_param_3]; |
| ; CHECK64-NEXT: atom.add.noftz.bf16 %rs2, [%rd1], %rs1; |
| ; CHECK64-NEXT: ld.param.u64 %rd2, [test_param_1]; |
| ; CHECK64-NEXT: mov.b16 %rs3, 0x3F80; |
| ; CHECK64-NEXT: atom.add.noftz.bf16 %rs4, [%rd1], %rs3; |
| ; CHECK64-NEXT: ld.param.u64 %rd3, [test_param_2]; |
| ; CHECK64-NEXT: atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1; |
| ; CHECK64-NEXT: atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1; |
| ; CHECK64-NEXT: ret; |
| ; |
| ; CHECKPTX71-LABEL: test( |
| ; CHECKPTX71: { |
| ; CHECKPTX71-NEXT: .reg .pred %p<5>; |
| ; CHECKPTX71-NEXT: .reg .b16 %rs<22>; |
| ; CHECKPTX71-NEXT: .reg .b32 %r<4>; |
| ; CHECKPTX71-NEXT: .reg .f32 %f<12>; |
| ; CHECKPTX71-EMPTY: |
| ; CHECKPTX71-NEXT: // %bb.0: |
| ; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3]; |
| ; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2]; |
| ; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1]; |
| ; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0]; |
| ; CHECKPTX71-NEXT: ld.b16 %rs18, [%r1]; |
| ; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13; |
| ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start14 |
| ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs18; |
| ; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1; |
| ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3; |
| ; CHECKPTX71-NEXT: atom.cas.b16 %rs3, [%r1], %rs18, %rs14; |
| ; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs3, %rs18; |
| ; CHECKPTX71-NEXT: mov.u16 %rs18, %rs3; |
| ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; |
| ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end13 |
| ; CHECKPTX71-NEXT: ld.b16 %rs19, [%r1]; |
| ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start8 |
| ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs19; |
| ; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; |
| ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs15, %f5; |
| ; CHECKPTX71-NEXT: atom.cas.b16 %rs6, [%r1], %rs19, %rs15; |
| ; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs6, %rs19; |
| ; CHECKPTX71-NEXT: mov.u16 %rs19, %rs6; |
| ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; |
| ; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end7 |
| ; CHECKPTX71-NEXT: ld.global.b16 %rs20, [%r2]; |
| ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start2 |
| ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs20; |
| ; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1; |
| ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f8; |
| ; CHECKPTX71-NEXT: atom.global.cas.b16 %rs9, [%r2], %rs20, %rs16; |
| ; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs9, %rs20; |
| ; CHECKPTX71-NEXT: mov.u16 %rs20, %rs9; |
| ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; |
| ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end1 |
| ; CHECKPTX71-NEXT: ld.shared.b16 %rs21, [%r3]; |
| ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start |
| ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs21; |
| ; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1; |
| ; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs17, %f11; |
| ; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs12, [%r3], %rs21, %rs17; |
| ; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs12, %rs21; |
| ; CHECKPTX71-NEXT: mov.u16 %rs21, %rs12; |
| ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; |
| ; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end |
| ; CHECKPTX71-NEXT: ret; |
| ; Generic pointer, runtime addend. |
| %r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst |
| ; Generic pointer, constant addend 1.0 (appears as 0x3F80 in the bf16 form and |
| ; as 0f3F800000 in the f32 loop form of the expected output). |
| %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst |
| ; addrspace(1) pointer: the expected output uses the .global forms. |
| %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst |
| ; addrspace(3) pointer: the expected output uses the .shared forms. |
| %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val seq_cst |
| ret void |
| } |
| |
| ; Attribute group #1 is not referenced by @test above. It is spelled with |
| ; `memory(argmem: readwrite)`, the current form of the legacy `argmemonly` |
| ; attribute, so the file does not depend on the parser's auto-upgrade path. |
| attributes #1 = { nounwind memory(argmem: readwrite) } |