|  | ! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s | 
|  |  | 
|  | ! Test CUDA Fortran procedures available in cudadevice module | 
|  |  | 
|  | attributes(global) subroutine devsub() | 
|  | implicit none | 
|  | integer :: ret | 
|  | real(2) :: r2 | 
|  | real(4) :: af | 
|  | real(8) :: ad | 
|  | integer(4) :: ai | 
|  | integer(8) :: al | 
|  | integer(8) :: time | 
|  | integer :: smalltime | 
|  | integer(4) :: res, offset | 
|  | integer(8) :: resl | 
|  |  | 
|  | integer :: tid | 
|  | tid = threadIdx%x | 
|  |  | 
|  | call syncthreads() | 
|  | call syncwarp(1) | 
|  | call threadfence() | 
|  | call threadfence_block() | 
|  | call threadfence_system() | 
|  | ret = syncthreads_and(1) | 
|  | res = syncthreads_and(tid > offset) | 
|  | ret = syncthreads_count(1) | 
|  | ret = syncthreads_count(tid > offset) | 
|  | ret = syncthreads_or(1) | 
|  | ret = syncthreads_or(tid > offset) | 
|  |  | 
|  | ai = atomicadd(ai, 1_4) | 
|  | al = atomicadd(al, 1_8) | 
|  | af = atomicadd(af, 1.0_4) | 
|  | ad = atomicadd(ad, 1.0_8) | 
|  |  | 
|  | ai = atomicsub(ai, 1_4) | 
|  | al = atomicsub(al, 1_8) | 
|  | af = atomicsub(af, 1.0_4) | 
|  | ad = atomicsub(ad, 1.0_8) | 
|  |  | 
|  | ai = atomicmax(ai, 1_4) | 
|  | al = atomicmax(al, 1_8) | 
|  | af = atomicmax(af, 1.0_4) | 
|  | ad = atomicmax(ad, 1.0_8) | 
|  |  | 
|  | ai = atomicmin(ai, 1_4) | 
|  | al = atomicmin(al, 1_8) | 
|  | af = atomicmin(af, 1.0_4) | 
|  | ad = atomicmin(ad, 1.0_8) | 
|  |  | 
|  | ai = atomicand(ai, 1_4) | 
|  | ai = atomicor(ai, 1_4) | 
|  | ai = atomicinc(ai, 1_4) | 
|  | ai = atomicdec(ai, 1_4) | 
|  |  | 
|  | smalltime = clock() | 
|  | time = clock64() | 
|  | time = globalTimer() | 
|  |  | 
|  | res = __popc(ai) | 
|  | res = __popc(al) | 
|  | res = __ffs(ai) | 
|  | res = __ffs(al) | 
|  | res = __brev(ai) | 
|  | resl = __brev(al) | 
|  |  | 
|  | res = __clz(ai) | 
|  | res = __clz(al) | 
|  | af = __cosf(af) | 
|  | ad = __ddiv_rn(ad, ad) | 
|  | ad = __ddiv_rz(ad, ad) | 
|  | ad = __ddiv_ru(ad, ad) | 
|  | ad = __ddiv_rd(ad, ad) | 
|  | af = __double2float_rn(ad) | 
|  | af = __double2float_rz(ad) | 
|  | af = __double2float_ru(ad) | 
|  | af = __double2float_rd(ad) | 
|  | ai = __double2int_rd(ad) | 
|  | ai = __double2int_rn(ad) | 
|  | ai = __double2int_ru(ad) | 
|  | ai = __double2int_rz(ad) | 
|  | ai = __double2uint_rd(ad) | 
|  | ai = __double2uint_rn(ad) | 
|  | ai = __double2uint_ru(ad) | 
|  | ai = __double2uint_rz(ad) | 
|  | ai = __mul24(ai, ai) | 
|  | ai = __umul24(ai, ai) | 
|  | af = __powf(af, af) | 
|  | ad = __ull2double_rd(al) | 
|  | ad = __ull2double_rn(al) | 
|  | ad = __ull2double_ru(al) | 
|  | ad = __ull2double_rz(al) | 
|  | r2 = __float2half_rn(af) | 
|  | af = __half2float(r2) | 
|  | ad = __ll2double_rd(al) | 
|  | ad = __ll2double_rn(al) | 
|  | ad = __ll2double_ru(al) | 
|  | ad = __ll2double_rz(al) | 
|  | end | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} | 
|  | ! CHECK: nvvm.barrier0 | 
|  | ! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> () | 
|  | ! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> () | 
|  | ! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> () | 
|  | ! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> () | 
|  | ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32 | 
|  | ! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32> | 
|  | ! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32> | 
|  | ! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[A]], %[[B]] : i32 | 
|  | ! CHECK: %[[CONV:.*]] = fir.convert %[[CMP]] : (i1) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%[[CONV]]) | 
|  | ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1{{.*}}) fastmath<contract> : (i32) -> i32 | 
|  | ! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32> | 
|  | ! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32> | 
|  | ! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[A]], %[[B]] : i32 | 
|  | ! CHECK: %[[CONV:.*]] = fir.convert %[[CMP]] : (i1) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%[[CONV]]) fastmath<contract> : (i32) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1{{.*}}) fastmath<contract> : (i32) -> i32 | 
|  | ! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32> | 
|  | ! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32> | 
|  | ! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[A]], %[[B]] : i32 | 
|  | ! CHECK: %[[CONV:.*]] = fir.convert %[[CMP]] : (i1) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%[[CONV]]) fastmath<contract> : (i32) -> i32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 | 
|  |  | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw sub  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw sub  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw fsub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw fsub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 | 
|  |  | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw max  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw max  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw fmax %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw fmax %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 | 
|  |  | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw min  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw min  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw fmin %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw fmin %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 | 
|  |  | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw _and  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw _or  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw uinc_wrap  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 | 
|  | ! CHECK: %{{.*}} = llvm.atomicrmw udec_wrap  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 | 
|  |  | 
|  | ! CHECK: %{{.*}} = nvvm.read.ptx.sreg.clock : i32 | 
|  | ! CHECK: %{{.*}} = nvvm.read.ptx.sreg.clock64 : i64 | 
|  | ! CHECK: %{{.*}} = nvvm.read.ptx.sreg.globaltimer : i64 | 
|  |  | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_popc(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_popcll(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ffs(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ffsll(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_brev(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_brevll(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> i64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_clz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_clzll(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_fast_cosf(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> f32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ddiv_rn(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64, f64) -> f64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ddiv_rz(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64, f64) -> f64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ddiv_ru(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64, f64) -> f64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ddiv_rd(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64, f64) -> f64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2float_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2float_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2float_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2float_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2int_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2int_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2int_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2int_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2uint_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2uint_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2uint_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_double2uint_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_mul24(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32, i32) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_umul24(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32, i32) -> i32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_fast_powf(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32, f32) -> f32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ull2double_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ull2double_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ull2double_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ull2double_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_float2half_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> f16 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_half2float(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f16) -> f32 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ll2double_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ll2double_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ll2double_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64 | 
|  | ! CHECK: %{{.*}} = fir.call @__nv_ll2double_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64 | 
|  |  | 
|  | subroutine host1() | 
|  | integer, device :: a(32) | 
|  | integer, device :: ret | 
|  | integer :: i, j | 
|  |  | 
|  | block; use cudadevice | 
|  | !$cuf kernel do(1) <<<*,32>>> | 
|  | do i = 1, 32 | 
|  | a(i) = a(i) * 2.0 | 
|  | call syncthreads() | 
|  | a(i) = a(i) + a(j) - 34.0 | 
|  |  | 
|  | call syncwarp(1) | 
|  | ret = syncthreads_and(1) | 
|  | ret = syncthreads_count(1) | 
|  | ret = syncthreads_or(1) | 
|  | end do | 
|  | end block | 
|  | end | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPhost1() | 
|  | ! CHECK: cuf.kernel | 
|  | ! CHECK: nvvm.barrier0 | 
|  | ! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> () | 
|  | ! CHECK: fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32 | 
|  | ! CHECK: fir.call @llvm.nvvm.barrier0.popc(%c1{{.*}}) fastmath<contract> : (i32) -> i32 | 
|  | ! CHECK: fir.call @llvm.nvvm.barrier0.or(%c1{{.*}}) fastmath<contract> : (i32) -> i32 | 
|  |  | 
|  | attributes(device) subroutine testMatch() | 
|  | integer :: a, ipred, mask, v32 | 
|  | integer(8) :: v64 | 
|  | real(4) :: r4 | 
|  | real(8) :: r8 | 
|  | a = match_all_sync(mask, v32, ipred) | 
|  | a = match_all_sync(mask, v64, ipred) | 
|  | a = match_all_sync(mask, r4, ipred) | 
|  | a = match_all_sync(mask, r8, ipred) | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtestmatch() | 
|  | ! CHECK: %{{.*}} = nvvm.match.sync  all %{{.*}}, %{{.*}} : i32 -> !llvm.struct<(i32, i1)> | 
|  | ! CHECK: %{{.*}} = nvvm.match.sync  all %{{.*}}, %{{.*}} : i64 -> !llvm.struct<(i32, i1)> | 
|  | ! CHECK: %{{.*}} = nvvm.match.sync  all %{{.*}}, %{{.*}} : i32 -> !llvm.struct<(i32, i1)> | 
|  | ! CHECK: %{{.*}} = nvvm.match.sync  all %{{.*}}, %{{.*}} : i64 -> !llvm.struct<(i32, i1)> | 
|  |  | 
|  | attributes(device) subroutine testMatchAny() | 
|  | integer :: a, mask, v32 | 
|  | integer(8) :: v64 | 
|  | real(4) :: r4 | 
|  | real(8) :: r8 | 
|  | a = match_any_sync(mask, v32) | 
|  | a = match_any_sync(mask, v64) | 
|  | a = match_any_sync(mask, r4) | 
|  | a = match_any_sync(mask, r8) | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtestmatchany() | 
|  | ! CHECK: %{{.*}} = nvvm.match.sync  any %{{.*}}, %{{.*}} : i32 -> i32 | 
|  | ! CHECK: %{{.*}} = nvvm.match.sync  any %{{.*}}, %{{.*}} : i64 -> i32 | 
|  | ! CHECK: %{{.*}} = nvvm.match.sync  any %{{.*}}, %{{.*}} : i32 -> i32 | 
|  | ! CHECK: %{{.*}} = nvvm.match.sync  any %{{.*}}, %{{.*}} : i64 -> i32 | 
|  |  | 
|  | attributes(device) subroutine testAtomic(aa, n) | 
|  | integer :: aa(*) | 
|  | integer, intent(in) :: n | 
|  | integer :: a, istat, j, i | 
|  | real :: r | 
|  | istat = atomicexch(a,0) | 
|  | istat = atomicexch(r, 0.0) | 
|  | istat = atomicxor(a, j) | 
|  | istat = atomiccas(a, i, 14) | 
|  | do i = 1, n | 
|  | istat = atomicxor(aa, i) | 
|  | istat = atomiccas(aa, i, 14) | 
|  | istat = atomicexch(aa, 0) | 
|  | end do | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtestatomic | 
|  | ! CHECK: llvm.atomicrmw xchg %{{.*}}, %c0{{.*}} seq_cst : !llvm.ptr, i32 | 
|  | ! CHECK: llvm.atomicrmw xchg %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 | 
|  | ! CHECK: llvm.atomicrmw _xor %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 | 
|  | ! CHECK: %[[ADDR:.*]] = builtin.unrealized_conversion_cast %{{.*}}#0 : !fir.ref<i32> to !llvm.ptr | 
|  | ! CHECK: llvm.cmpxchg %[[ADDR]], %{{.*}}, %c14{{.*}} acq_rel monotonic : !llvm.ptr, i32 | 
|  | ! CHECK: fir.do_loop | 
|  | ! CHECK: llvm.atomicrmw _xor %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 | 
|  | ! CHECK: %[[ADDR:.*]] = builtin.unrealized_conversion_cast %{{.*}}#1 : !fir.ref<!fir.array<?xi32>> to !llvm.ptr | 
|  | ! CHECK: llvm.cmpxchg %[[ADDR]], %{{.*}}, %c14{{.*}} acq_rel monotonic : !llvm.ptr, i32 | 
|  | ! CHECK: llvm.atomicrmw xchg %{{.*}}, %c0{{.*}} seq_cst : !llvm.ptr, i32 | 
|  |  | 
|  | attributes(device) subroutine testAtomic2() | 
|  | integer(8) :: a, i, istat | 
|  | istat = atomiccas(a, i, 14) | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtestatomic2() | 
|  | ! CHECK: %[[VAL:.*]] = fir.convert %c14{{.*}} : (i32) -> i64 | 
|  | ! CHECK: %[[ADDR:.*]] = builtin.unrealized_conversion_cast %{{.*}}#0 : !fir.ref<i64> to !llvm.ptr | 
|  | ! CHECK: llvm.cmpxchg %{{.*}}, %{{.*}}, %[[VAL]] acq_rel monotonic : !llvm.ptr, i64 | 
|  |  | 
|  | attributes(device) subroutine testAtomic3() | 
|  | real :: a, i, istat | 
|  | istat = atomiccas(a, i, 14.0) | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtestatomic3() | 
|  | ! CHECK: %[[BCAST1:.*]] = llvm.bitcast %{{.*}} : f32 to i32 | 
|  | ! CHECK: %[[BCAST2:.*]] = llvm.bitcast %{{.*}} : f32 to i32 | 
|  | ! CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %{{.*}}#0 : !fir.ref<f32> to !llvm.ptr | 
|  | ! CHECK: llvm.cmpxchg %[[CAST]], %[[BCAST1]], %[[BCAST2]] acq_rel monotonic : !llvm.ptr, i32 | 
|  |  | 
|  | attributes(device) subroutine testAtomic4() | 
|  | real(8) :: a, i, istat | 
|  | istat = atomiccas(a, i, 14.0d0) | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtestatomic4() | 
|  | ! CHECK: %[[BCAST1:.*]] = llvm.bitcast %{{.*}} : f64 to i64 | 
|  | ! CHECK: %[[BCAST2:.*]] = llvm.bitcast %{{.*}} : f64 to i64 | 
|  | ! CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %{{.*}}#0 : !fir.ref<f64> to !llvm.ptr | 
|  | ! CHECK: %[[ATOMIC:.*]] = llvm.cmpxchg %[[CAST]], %[[BCAST1]], %[[BCAST2]] acq_rel monotonic : !llvm.ptr, i64 | 
|  | ! CHECK: %[[RES:.*]] = llvm.extractvalue %[[ATOMIC]][1] : !llvm.struct<(i64, i1)> | 
|  |  | 
|  | attributes(global) subroutine __ldXXi4(b) | 
|  | integer, device :: b(*) | 
|  | integer, device :: x(4) | 
|  | x(1:4) = __ldca(b(i:j)) | 
|  | x = __ldcg(b(i:j)) | 
|  | x = __ldcs(b(i:j)) | 
|  | x(1:4) = __ldlu(b(i:j)) | 
|  | x(1:4) = __ldcv(b(i:j)) | 
|  | end | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QP__ldxxi4 | 
|  | ! CHECK: fir.call @__ldca_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> () | 
|  | ! CHECK: fir.call @__ldcg_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> () | 
|  | ! CHECK: fir.call @__ldcs_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> () | 
|  | ! CHECK: fir.call @__ldlu_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> () | 
|  | ! CHECK: fir.call @__ldcv_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> () | 
|  |  | 
|  | attributes(global) subroutine __ldXXi8(b) | 
|  | integer(8), device :: b(*) | 
|  | integer(8), device :: x(2) | 
|  | x(1:2) = __ldca(b(i:j)) | 
|  | x = __ldcg(b(i:j)) | 
|  | x = __ldcs(b(i:j)) | 
|  | x(1:2) = __ldlu(b(i:j)) | 
|  | x(1:2) = __ldcv(b(i:j)) | 
|  | end | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QP__ldxxi8 | 
|  | ! CHECK: fir.call @__ldca_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> () | 
|  | ! CHECK: fir.call @__ldcg_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> () | 
|  | ! CHECK: fir.call @__ldcs_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> () | 
|  | ! CHECK: fir.call @__ldlu_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> () | 
|  | ! CHECK: fir.call @__ldcv_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> () | 
|  |  | 
|  | attributes(global) subroutine __ldXXr4(b) | 
|  | real, device :: b(*) | 
|  | real, device :: x(4) | 
|  | x(1:4) = __ldca(b(i:j)) | 
|  | x = __ldcg(b(i:j)) | 
|  | x = __ldcs(b(i:j)) | 
|  | x(1:4) = __ldlu(b(i:j)) | 
|  | x(1:4) = __ldcv(b(i:j)) | 
|  | end | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QP__ldxxr4 | 
|  | ! CHECK: fir.call @__ldca_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> () | 
|  | ! CHECK: fir.call @__ldcg_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> () | 
|  | ! CHECK: fir.call @__ldcs_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> () | 
|  | ! CHECK: fir.call @__ldlu_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> () | 
|  | ! CHECK: fir.call @__ldcv_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> () | 
|  |  | 
|  | attributes(global) subroutine __ldXXr2(b) | 
|  | real(2), device :: b(*) | 
|  | real(2), device :: x(2) | 
|  | x(1:2) = __ldca(b(i:j)) | 
|  | x = __ldcg(b(i:j)) | 
|  | x = __ldcs(b(i:j)) | 
|  | x(1:2) = __ldlu(b(i:j)) | 
|  | x(1:2) = __ldcv(b(i:j)) | 
|  | end | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QP__ldxxr2 | 
|  | ! CHECK: fir.call @__ldca_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> () | 
|  | ! CHECK: fir.call @__ldcg_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> () | 
|  | ! CHECK: fir.call @__ldcs_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> () | 
|  | ! CHECK: fir.call @__ldlu_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> () | 
|  | ! CHECK: fir.call @__ldcv_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> () | 
|  |  | 
|  | attributes(global) subroutine __ldXXr8(b) | 
|  | real(8), device :: b(*) | 
|  | real(8), device :: x(2) | 
|  | x(1:2) = __ldca(b(i:j)) | 
|  | x = __ldcg(b(i:j)) | 
|  | x = __ldcs(b(i:j)) | 
|  | x(1:2) = __ldlu(b(i:j)) | 
|  | x(1:2) = __ldcv(b(i:j)) | 
|  | end | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QP__ldxxr8 | 
|  | ! CHECK: fir.call @__ldca_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> () | 
|  | ! CHECK: fir.call @__ldcg_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> () | 
|  | ! CHECK: fir.call @__ldcs_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> () | 
|  | ! CHECK: fir.call @__ldlu_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> () | 
|  | ! CHECK: fir.call @__ldcv_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> () | 
|  |  | 
|  | attributes(device) subroutine testVote() | 
|  | integer :: a, ipred, mask | 
|  | logical(4) :: pred | 
|  | a = all_sync(mask, pred) | 
|  | a = any_sync(mask, pred) | 
|  | a = ballot_sync(mask, pred) | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtestvote() | 
|  | ! CHECK: %{{.*}} = nvvm.vote.sync all %{{.*}}, %{{.*}} -> i1 | 
|  | ! CHECK: %{{.*}} = nvvm.vote.sync any %{{.*}}, %{{.*}} -> i1 | 
|  | ! CHECK: %{{.*}} = nvvm.vote.sync ballot %{{.*}}, %{{.*}} -> i32 | 
|  |  | 
|  | attributes(global) subroutine test_barrier() | 
|  | integer(8), shared :: barrier | 
|  | integer(8) :: token | 
|  | integer :: count | 
|  | call barrier_init(barrier, 256) | 
|  |  | 
|  | token = barrier_arrive(barrier) | 
|  |  | 
|  | token = barrier_arrive(barrier, count) | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtest_barrier() | 
|  |  | 
|  | ! CHECK: %[[SHARED:.*]] = cuf.shared_memory i64 {bindc_name = "barrier", uniq_name = "_QFtest_barrierEbarrier"} -> !fir.ref<i64> | 
|  | ! CHECK: %[[DECL_SHARED:.*]]:2 = hlfir.declare %[[SHARED]] {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_barrierEbarrier"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>) | 
|  | ! CHECK: %[[COUNT:.*]] = arith.constant 256 : i32 | 
|  | ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr | 
|  | ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> | 
|  | ! CHECK: nvvm.mbarrier.init.shared %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32 | 
|  | ! CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>} | 
|  |  | 
|  | ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr | 
|  | ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> | 
|  | ! CHECK: %{{.*}} = nvvm.mbarrier.arrive.shared %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 | 
|  |  | 
|  | ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr | 
|  | ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> | 
|  | ! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32 | 
|  |  | 
|  |  | 
|  | attributes(global) subroutine test_fence() | 
|  | call fence_proxy_async() | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtest_fence() | 
|  | ! CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>} | 
|  |  | 
|  | attributes(global) subroutine test_tma() | 
|  | call tma_bulk_commit_group() | 
|  | call tma_bulk_wait_group() | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtest_tma() | 
|  | ! CHECK: nvvm.cp.async.bulk.commit.group | 
|  | ! CHECK: nvvm.cp.async.bulk.wait_group 0 | 
|  |  | 
|  | attributes(global) subroutine test_bulk_g2s(a) | 
|  | real(8), device :: a(*) | 
|  | real(8), shared :: tmpa(1024) | 
|  | integer(8), shared :: barrier1 | 
|  | integer(4) :: tx_count | 
|  | call tma_bulk_g2s(barrier1, a(j), tmpa, tx_count) | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtest_bulk_g2s | 
|  | ! CHECK: nvvm.cp.async.bulk.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : <7>, <1> | 
|  |  | 
|  | attributes(global) subroutine test_bulk_s2g(a) | 
|  | real(8), device :: a(*) | 
|  | real(8), shared :: tmpa(1024) | 
|  | integer(4) :: tx_count | 
|  | call tma_bulk_s2g(tmpa, a(j), tx_count) | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtest_bulk_s2g | 
|  | ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> | 
|  |  | 
|  | attributes(device) subroutine testAtomicCasLoop(aa, n) | 
|  | integer :: a | 
|  | do while (atomiccas(a, 0, 1) == 1) | 
|  | end do | 
|  | end subroutine | 
|  |  | 
|  | ! CHECK-LABEL: func.func @_QPtestatomiccasloop | 
|  | ! CHECK: %[[CMP_XCHG:.*]] = llvm.cmpxchg %15, %c0_i32, %c1_i32 acq_rel monotonic : !llvm.ptr, i32 | 
|  | ! CHECK: %[[CMP_XCHG_EV:.*]] = llvm.extractvalue %[[CMP_XCHG]][1] : !llvm.struct<(i32, i1)> | 
|  | ! CHECK: %[[CASTED_CMP_XCHG_EV:.*]] = fir.convert %[[CMP_XCHG_EV]] : (i1) -> i32 | 
|  | ! CHECK: %{{.*}} = arith.constant 1 : i32 | 
|  | ! CHECK: %19 = arith.cmpi eq, %[[CASTED_CMP_XCHG_EV]], %{{.*}} : i32 |