! flang/test/Lower/CUDA/cuda-device-proc.cuf - llvm-project - Git at Google

 ! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s

 ! Test CUDA Fortran procedures available in cudadevice module

 ! Global kernel that exercises the synchronization, memory-fence, atomic and
 ! clock procedures of the cudadevice module. The FileCheck directives that
 ! follow the subroutine list, in call order, the operation each call is
 ! expected to lower to.
 attributes(global) subroutine devsub()
   implicit none
   integer :: ret
   real(4) :: af
   real(8) :: ad
   integer(4) :: ai
   integer(8) :: al
   integer(8) :: time

   ! Barrier and fence calls; expected as calls to llvm.nvvm.barrier0,
   ! bar.warp.sync, membar.gl, membar.cta and membar.sys.
   call syncthreads()
   call syncwarp(1)
   call threadfence()
   call threadfence_block()
   call threadfence_system()
   ! Value-returning barriers; expected as barrier0.and, barrier0.popc and
   ! barrier0.or.
   ret = syncthreads_and(1)
   ret = syncthreads_count(1)
   ret = syncthreads_or(1)

   ! Atomic add on i32, i64, f32 and f64; expected as llvm.atomicrmw add/fadd.
   ai = atomicadd(ai, 1_4)
   al = atomicadd(al, 1_8)
   af = atomicadd(af, 1.0_4)
   ad = atomicadd(ad, 1.0_8)

   ! Atomic subtract; expected as llvm.atomicrmw sub/fsub.
   ai = atomicsub(ai, 1_4)
   al = atomicsub(al, 1_8)
   af = atomicsub(af, 1.0_4)
   ad = atomicsub(ad, 1.0_8)

   ! Atomic maximum; expected as llvm.atomicrmw max/fmax.
   ai = atomicmax(ai, 1_4)
   al = atomicmax(al, 1_8)
   af = atomicmax(af, 1.0_4)
   ad = atomicmax(ad, 1.0_8)

   ! Atomic minimum; expected as llvm.atomicrmw min/fmin.
   ai = atomicmin(ai, 1_4)
   al = atomicmin(al, 1_8)
   af = atomicmin(af, 1.0_4)
   ad = atomicmin(ad, 1.0_8)

   ! 32-bit integer atomics; expected as llvm.atomicrmw _and, _or, uinc_wrap
   ! and udec_wrap.
   ai = atomicand(ai, 1_4)
   ai = atomicor(ai, 1_4)
   ai = atomicinc(ai, 1_4)
   ai = atomicdec(ai, 1_4)

   ! Clock read; expected as a call to llvm.nvvm.read.ptx.sreg.clock64.
   time = clock64()
 end

 ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
 ! CHECK: fir.call @llvm.nvvm.barrier0() fastmath<contract> : () -> ()
 ! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> ()
 ! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> ()
 ! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> ()
 ! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> ()
 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1_i32_0) fastmath<contract> : (i32) -> i32
 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1_i32_1) fastmath<contract> : (i32) -> i32
 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1_i32_2) fastmath<contract> : (i32) -> i32
 ! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64

 ! CHECK: %{{.*}} = llvm.atomicrmw sub  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: %{{.*}} = llvm.atomicrmw sub  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
 ! CHECK: %{{.*}} = llvm.atomicrmw fsub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
 ! CHECK: %{{.*}} = llvm.atomicrmw fsub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64

 ! CHECK: %{{.*}} = llvm.atomicrmw max  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: %{{.*}} = llvm.atomicrmw max  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
 ! CHECK: %{{.*}} = llvm.atomicrmw fmax %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
 ! CHECK: %{{.*}} = llvm.atomicrmw fmax %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64

 ! CHECK: %{{.*}} = llvm.atomicrmw min  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: %{{.*}} = llvm.atomicrmw min  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
 ! CHECK: %{{.*}} = llvm.atomicrmw fmin %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
 ! CHECK: %{{.*}} = llvm.atomicrmw fmin %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64

 ! CHECK: %{{.*}} = llvm.atomicrmw _and  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: %{{.*}} = llvm.atomicrmw _or  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: %{{.*}} = llvm.atomicrmw uinc_wrap  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: %{{.*}} = llvm.atomicrmw udec_wrap  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32

 ! CHECK: fir.call @llvm.nvvm.read.ptx.sreg.clock64()

 ! Host subroutine whose !$cuf kernel do loop calls the synchronization
 ! procedures. cudadevice is brought into scope by the use statement inside the
 ! block construct. The directives that follow expect a cuf.kernel operation
 ! followed by the matching llvm.nvvm barrier calls.
 subroutine host1()
   integer, device :: a(32)
   integer, device :: ret
   ! Note: j is never assigned in this subroutine; only the lowering is checked.
   integer :: i, j

 block; use cudadevice
   !$cuf kernel do(1) <<<*,32>>>
   do i = 1, 32
     a(i) = a(i) * 2.0
     call syncthreads()
     a(i) = a(i) + a(j) - 34.0

     call syncwarp(1)
     ret = syncthreads_and(1)
     ret = syncthreads_count(1)
     ret = syncthreads_or(1)
   end do
 end block
 end

 ! CHECK-LABEL: func.func @_QPhost1()
 ! CHECK: cuf.kernel
 ! CHECK: fir.call @llvm.nvvm.barrier0() fastmath<contract> : () -> ()
 ! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> ()
 ! CHECK: fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32
 ! CHECK: fir.call @llvm.nvvm.barrier0.popc(%c1{{.*}}) fastmath<contract> : (i32) -> i32
 ! CHECK: fir.call @llvm.nvvm.barrier0.or(%c1{{.*}}) fastmath<contract> : (i32) -> i32

 ! Device subroutine calling match_all_sync with a default integer, an
 ! integer(8), a real(4) and a real(8) value. The directives that follow expect
 ! llvm.nvvm.match.all.sync.i32p / .i64p calls, with the real values first
 ! converted by fir.convert to the integer type of the same width.
 attributes(device) subroutine testMatch()
   integer :: a, ipred, mask, v32
   integer(8) :: v64
   real(4) :: r4
   real(8) :: r8
   a = match_all_sync(mask, v32, ipred)
   a = match_all_sync(mask, v64, ipred)
   a = match_all_sync(mask, r4, ipred)
   a = match_all_sync(mask, r8, ipred)
 end subroutine

 ! CHECK-LABEL: func.func @_QPtestmatch()
 ! CHECK: fir.call @llvm.nvvm.match.all.sync.i32p
 ! CHECK: fir.call @llvm.nvvm.match.all.sync.i64p
 ! CHECK: fir.convert %{{.*}} : (f32) -> i32
 ! CHECK: fir.call @llvm.nvvm.match.all.sync.i32p
 ! CHECK: fir.convert %{{.*}} : (f64) -> i64
 ! CHECK: fir.call @llvm.nvvm.match.all.sync.i64p

 ! Device subroutine calling match_any_sync with a default integer, an
 ! integer(8), a real(4) and a real(8) value. The directives that follow expect
 ! llvm.nvvm.match.any.sync.i32p / .i64p calls, with the real values first
 ! converted by fir.convert to the integer type of the same width.
 attributes(device) subroutine testMatchAny()
   integer :: a, mask, v32
   integer(8) :: v64
   real(4) :: r4
   real(8) :: r8
   a = match_any_sync(mask, v32)
   a = match_any_sync(mask, v64)
   a = match_any_sync(mask, r4)
   a = match_any_sync(mask, r8)
 end subroutine

 ! CHECK-LABEL: func.func @_QPtestmatchany()
 ! CHECK: fir.call @llvm.nvvm.match.any.sync.i32p
 ! CHECK: fir.call @llvm.nvvm.match.any.sync.i64p
 ! CHECK: fir.convert %{{.*}} : (f32) -> i32
 ! CHECK: fir.call @llvm.nvvm.match.any.sync.i32p
 ! CHECK: fir.convert %{{.*}} : (f64) -> i64
 ! CHECK: fir.call @llvm.nvvm.match.any.sync.i64p

 ! Device subroutine exercising atomicexch, atomicxor and atomiccas, first on
 ! local scalars and then, inside a do loop, on the assumed-size dummy array aa.
 ! The directives that follow expect llvm.atomicrmw xchg / _xor and
 ! llvm.cmpxchg operations in the same order as the calls.
 attributes(device) subroutine testAtomic(aa, n)
   integer :: aa(*)
   integer, intent(in) :: n
   integer :: a, istat, j, i
   real :: r
   istat = atomicexch(a,0)
   ! Real exchange; expected as an f32 xchg.
   istat = atomicexch(r, 0.0)
   istat = atomicxor(a, j)
   istat = atomiccas(a, i, 14)
   do i = 1, n
     ! aa is passed as a whole array; the expected address cast is from
     ! !fir.ref<!fir.array<?xi32>>.
     istat = atomicxor(aa, i)
     istat = atomiccas(aa, i, 14)
     istat = atomicexch(aa, 0)
   end do
 end subroutine

 ! CHECK-LABEL: func.func @_QPtestatomic
 ! CHECK: llvm.atomicrmw xchg %{{.*}}, %c0{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: llvm.atomicrmw xchg %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
 ! CHECK: llvm.atomicrmw _xor %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: %[[ADDR:.*]] = builtin.unrealized_conversion_cast %{{.*}}#1 : !fir.ref<i32> to !llvm.ptr
 ! CHECK: llvm.cmpxchg %[[ADDR]], %{{.*}}, %c14{{.*}} acq_rel monotonic : !llvm.ptr, i32
 ! CHECK: fir.do_loop
 ! CHECK: llvm.atomicrmw _xor %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: %[[ADDR:.*]] = builtin.unrealized_conversion_cast %{{.*}}#1 : !fir.ref<!fir.array<?xi32>> to !llvm.ptr
 ! CHECK: llvm.cmpxchg %[[ADDR]], %{{.*}}, %c14{{.*}} acq_rel monotonic : !llvm.ptr, i32
 ! CHECK: llvm.atomicrmw xchg %{{.*}}, %c0{{.*}} seq_cst : !llvm.ptr, i32

 ! atomiccas on integer(8) operands with the default-integer literal 14. The
 ! directives that follow expect the constant to be converted from i32 to i64
 ! and used as the new value of an i64 llvm.cmpxchg.
 attributes(device) subroutine testAtomic2()
   integer(8) :: a, i, istat
   istat = atomiccas(a, i, 14)
 end subroutine

 ! CHECK-LABEL: func.func @_QPtestatomic2()
 ! CHECK: %[[VAL:.*]] = fir.convert %c14{{.*}} : (i32) -> i64
 ! CHECK: %[[ADDR:.*]] = builtin.unrealized_conversion_cast %{{.*}}#1 : !fir.ref<i64> to !llvm.ptr
 ! CHECK: llvm.cmpxchg %{{.*}}, %{{.*}}, %[[VAL]] acq_rel monotonic : !llvm.ptr, i64

 ! atomiccas on default-real operands. The directives that follow expect both
 ! value operands to be bitcast from f32 to i32 and passed to an i32
 ! llvm.cmpxchg.
 attributes(device) subroutine testAtomic3()
   real :: a, i, istat
   istat = atomiccas(a, i, 14.0)
 end subroutine

 ! CHECK-LABEL: func.func @_QPtestatomic3()
 ! CHECK: %[[BCAST1:.*]] = llvm.bitcast %{{.*}} : f32 to i32
 ! CHECK: %[[BCAST2:.*]] = llvm.bitcast %{{.*}} : f32 to i32
 ! CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %{{.*}}#1 : !fir.ref<f32> to !llvm.ptr
 ! CHECK: llvm.cmpxchg %[[CAST]], %[[BCAST1]], %[[BCAST2]] acq_rel monotonic : !llvm.ptr, i32

 ! atomiccas on real(8) operands. The directives that follow expect both value
 ! operands to be bitcast from f64 to i64 and passed to an i64 llvm.cmpxchg,
 ! followed by an llvm.extractvalue of element [1] of the (i64, i1) result.
 attributes(device) subroutine testAtomic4()
   real(8) :: a, i, istat
   istat = atomiccas(a, i, 14.0d0)
 end subroutine

 ! CHECK-LABEL: func.func @_QPtestatomic4()
 ! CHECK: %[[BCAST1:.*]] = llvm.bitcast %{{.*}} : f64 to i64
 ! CHECK: %[[BCAST2:.*]] = llvm.bitcast %{{.*}} : f64 to i64
 ! CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %{{.*}}#1 : !fir.ref<f64> to !llvm.ptr
 ! CHECK: %[[ATOMIC:.*]] = llvm.cmpxchg %[[CAST]], %[[BCAST1]], %[[BCAST2]] acq_rel monotonic : !llvm.ptr, i64
 ! CHECK: %[[RES:.*]] = llvm.extractvalue %[[ATOMIC]][1] : !llvm.struct<(i64, i1)>

 ! Global kernel calling __ldca, __ldcg, __ldcs, __ldlu and __ldcv on a section
 ! of the default-integer device array b and storing into the 4-element device
 ! array x. The directives that follow expect calls to @__ldca_i4x4_ and its
 ! siblings taking (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>).
 ! Note: i and j are not declared (no implicit none) and are never assigned here.
 attributes(global) subroutine __ldXXi4(b)
   integer, device :: b(*)
   integer, device :: x(4)
   x(1:4) = __ldca(b(i:j))
   x = __ldcg(b(i:j))
   x = __ldcs(b(i:j))
   x(1:4) = __ldlu(b(i:j))
   x(1:4) = __ldcv(b(i:j))
 end

 ! CHECK-LABEL: func.func @_QP__ldxxi4
 ! CHECK: fir.call @__ldca_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
 ! CHECK: fir.call @__ldcg_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
 ! CHECK: fir.call @__ldcs_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
 ! CHECK: fir.call @__ldlu_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
 ! CHECK: fir.call @__ldcv_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()

 ! Global kernel calling __ldca, __ldcg, __ldcs, __ldlu and __ldcv on a section
 ! of the integer(8) device array b and storing into the 2-element device
 ! array x. The directives that follow expect calls to @__ldca_i8x2_ and its
 ! siblings taking (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>).
 ! Note: i and j are not declared (no implicit none) and are never assigned here.
 attributes(global) subroutine __ldXXi8(b)
   integer(8), device :: b(*)
   integer(8), device :: x(2)
   x(1:2) = __ldca(b(i:j))
   x = __ldcg(b(i:j))
   x = __ldcs(b(i:j))
   x(1:2) = __ldlu(b(i:j))
   x(1:2) = __ldcv(b(i:j))
 end

 ! CHECK-LABEL: func.func @_QP__ldxxi8
 ! CHECK: fir.call @__ldca_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
 ! CHECK: fir.call @__ldcg_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
 ! CHECK: fir.call @__ldcs_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
 ! CHECK: fir.call @__ldlu_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
 ! CHECK: fir.call @__ldcv_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()

 ! Global kernel calling __ldca, __ldcg, __ldcs, __ldlu and __ldcv on a section
 ! of the default-real device array b and storing into the 4-element device
 ! array x. The directives that follow expect calls to @__ldca_r4x4_ and its
 ! siblings taking (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>).
 ! Note: i and j are not declared (no implicit none) and are never assigned here.
 attributes(global) subroutine __ldXXr4(b)
   real, device :: b(*)
   real, device :: x(4)
   x(1:4) = __ldca(b(i:j))
   x = __ldcg(b(i:j))
   x = __ldcs(b(i:j))
   x(1:4) = __ldlu(b(i:j))
   x(1:4) = __ldcv(b(i:j))
 end

 ! CHECK-LABEL: func.func @_QP__ldxxr4
 ! CHECK: fir.call @__ldca_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
 ! CHECK: fir.call @__ldcg_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
 ! CHECK: fir.call @__ldcs_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
 ! CHECK: fir.call @__ldlu_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
 ! CHECK: fir.call @__ldcv_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()

 ! Global kernel calling __ldca, __ldcg, __ldcs, __ldlu and __ldcv on a section
 ! of the real(2) device array b and storing into the 2-element device
 ! array x. The directives that follow expect calls to @__ldca_r2x2_ and its
 ! siblings taking (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>).
 ! Note: i and j are not declared (no implicit none) and are never assigned here.
 attributes(global) subroutine __ldXXr2(b)
   real(2), device :: b(*)
   real(2), device :: x(2)
   x(1:2) = __ldca(b(i:j))
   x = __ldcg(b(i:j))
   x = __ldcs(b(i:j))
   x(1:2) = __ldlu(b(i:j))
   x(1:2) = __ldcv(b(i:j))
 end

 ! CHECK-LABEL: func.func @_QP__ldxxr2
 ! CHECK: fir.call @__ldca_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
 ! CHECK: fir.call @__ldcg_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
 ! CHECK: fir.call @__ldcs_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
 ! CHECK: fir.call @__ldlu_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
 ! CHECK: fir.call @__ldcv_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()

 ! Global kernel calling __ldca, __ldcg, __ldcs, __ldlu and __ldcv on a section
 ! of the real(8) device array b and storing into the 2-element device
 ! array x. The directives that follow expect calls to @__ldca_r8x2_ and its
 ! siblings taking (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>).
 ! Note: i and j are not declared (no implicit none) and are never assigned here.
 attributes(global) subroutine __ldXXr8(b)
   real(8), device :: b(*)
   real(8), device :: x(2)
   x(1:2) = __ldca(b(i:j))
   x = __ldcg(b(i:j))
   x = __ldcs(b(i:j))
   x(1:2) = __ldlu(b(i:j))
   x(1:2) = __ldcv(b(i:j))
 end

 ! CHECK-LABEL: func.func @_QP__ldxxr8
 ! CHECK: fir.call @__ldca_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
 ! CHECK: fir.call @__ldcg_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
 ! CHECK: fir.call @__ldcs_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
 ! CHECK: fir.call @__ldlu_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
 ! CHECK: fir.call @__ldcv_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
	! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s

	! Test CUDA Fortran procedures available in cudadevice module

	! Global kernel that exercises the synchronization, memory-fence, atomic and
	! clock procedures of the cudadevice module. The FileCheck directives that
	! follow the subroutine list, in call order, the operation each call is
	! expected to lower to.
	attributes(global) subroutine devsub()
	implicit none
	integer :: ret
	real(4) :: af
	real(8) :: ad
	integer(4) :: ai
	integer(8) :: al
	integer(8) :: time

	! Barrier and fence calls; expected as calls to llvm.nvvm.barrier0,
	! bar.warp.sync, membar.gl, membar.cta and membar.sys.
	call syncthreads()
	call syncwarp(1)
	call threadfence()
	call threadfence_block()
	call threadfence_system()
	! Value-returning barriers; expected as barrier0.and, barrier0.popc and
	! barrier0.or.
	ret = syncthreads_and(1)
	ret = syncthreads_count(1)
	ret = syncthreads_or(1)

	! Atomic add on i32, i64, f32 and f64; expected as llvm.atomicrmw add/fadd.
	ai = atomicadd(ai, 1_4)
	al = atomicadd(al, 1_8)
	af = atomicadd(af, 1.0_4)
	ad = atomicadd(ad, 1.0_8)

	! Atomic subtract; expected as llvm.atomicrmw sub/fsub.
	ai = atomicsub(ai, 1_4)
	al = atomicsub(al, 1_8)
	af = atomicsub(af, 1.0_4)
	ad = atomicsub(ad, 1.0_8)

	! Atomic maximum; expected as llvm.atomicrmw max/fmax.
	ai = atomicmax(ai, 1_4)
	al = atomicmax(al, 1_8)
	af = atomicmax(af, 1.0_4)
	ad = atomicmax(ad, 1.0_8)

	! Atomic minimum; expected as llvm.atomicrmw min/fmin.
	ai = atomicmin(ai, 1_4)
	al = atomicmin(al, 1_8)
	af = atomicmin(af, 1.0_4)
	ad = atomicmin(ad, 1.0_8)

	! 32-bit integer atomics; expected as llvm.atomicrmw _and, _or, uinc_wrap
	! and udec_wrap.
	ai = atomicand(ai, 1_4)
	ai = atomicor(ai, 1_4)
	ai = atomicinc(ai, 1_4)
	ai = atomicdec(ai, 1_4)

	! Clock read; expected as a call to llvm.nvvm.read.ptx.sreg.clock64.
	time = clock64()
	end

	! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
	! CHECK: fir.call @llvm.nvvm.barrier0() fastmath<contract> : () -> ()
	! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> ()
	! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> ()
	! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> ()
	! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> ()
	! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1_i32_0) fastmath<contract> : (i32) -> i32
	! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1_i32_1) fastmath<contract> : (i32) -> i32
	! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1_i32_2) fastmath<contract> : (i32) -> i32
	! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
	! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
	! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
	! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64

	! CHECK: %{{.*}} = llvm.atomicrmw sub  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
	! CHECK: %{{.*}} = llvm.atomicrmw sub  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
	! CHECK: %{{.*}} = llvm.atomicrmw fsub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
	! CHECK: %{{.*}} = llvm.atomicrmw fsub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64

	! CHECK: %{{.*}} = llvm.atomicrmw max  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
	! CHECK: %{{.*}} = llvm.atomicrmw max  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
	! CHECK: %{{.*}} = llvm.atomicrmw fmax %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
	! CHECK: %{{.*}} = llvm.atomicrmw fmax %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64

	! CHECK: %{{.*}} = llvm.atomicrmw min  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
	! CHECK: %{{.*}} = llvm.atomicrmw min  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
	! CHECK: %{{.*}} = llvm.atomicrmw fmin %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
	! CHECK: %{{.*}} = llvm.atomicrmw fmin %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64

	! CHECK: %{{.*}} = llvm.atomicrmw _and  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
	! CHECK: %{{.*}} = llvm.atomicrmw _or  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
	! CHECK: %{{.*}} = llvm.atomicrmw uinc_wrap  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
	! CHECK: %{{.*}} = llvm.atomicrmw udec_wrap  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32

	! CHECK: fir.call @llvm.nvvm.read.ptx.sreg.clock64()

	! Host subroutine whose !$cuf kernel do loop calls the synchronization
	! procedures. cudadevice is brought into scope by the use statement inside the
	! block construct. The directives that follow expect a cuf.kernel operation
	! followed by the matching llvm.nvvm barrier calls.
	subroutine host1()
	integer, device :: a(32)
	integer, device :: ret
	! Note: j is never assigned in this subroutine; only the lowering is checked.
	integer :: i, j

	block; use cudadevice
	!$cuf kernel do(1) <<<*,32>>>
	do i = 1, 32
	a(i) = a(i) * 2.0
	call syncthreads()
	a(i) = a(i) + a(j) - 34.0

	call syncwarp(1)
	ret = syncthreads_and(1)
	ret = syncthreads_count(1)
	ret = syncthreads_or(1)
	end do
	end block
	end

	! CHECK-LABEL: func.func @_QPhost1()
	! CHECK: cuf.kernel
	! CHECK: fir.call @llvm.nvvm.barrier0() fastmath<contract> : () -> ()
	! CHECK: fir.call @llvm.nvvm.bar.warp.sync(%c1{{.*}}) fastmath<contract> : (i32) -> ()
	! CHECK: fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32
	! CHECK: fir.call @llvm.nvvm.barrier0.popc(%c1{{.*}}) fastmath<contract> : (i32) -> i32
	! CHECK: fir.call @llvm.nvvm.barrier0.or(%c1{{.*}}) fastmath<contract> : (i32) -> i32

	! Device subroutine calling match_all_sync with a default integer, an
	! integer(8), a real(4) and a real(8) value. The directives that follow expect
	! llvm.nvvm.match.all.sync.i32p / .i64p calls, with the real values first
	! converted by fir.convert to the integer type of the same width.
	attributes(device) subroutine testMatch()
	integer :: a, ipred, mask, v32
	integer(8) :: v64
	real(4) :: r4
	real(8) :: r8
	a = match_all_sync(mask, v32, ipred)
	a = match_all_sync(mask, v64, ipred)
	a = match_all_sync(mask, r4, ipred)
	a = match_all_sync(mask, r8, ipred)
	end subroutine

	! CHECK-LABEL: func.func @_QPtestmatch()
	! CHECK: fir.call @llvm.nvvm.match.all.sync.i32p
	! CHECK: fir.call @llvm.nvvm.match.all.sync.i64p
	! CHECK: fir.convert %{{.*}} : (f32) -> i32
	! CHECK: fir.call @llvm.nvvm.match.all.sync.i32p
	! CHECK: fir.convert %{{.*}} : (f64) -> i64
	! CHECK: fir.call @llvm.nvvm.match.all.sync.i64p

	! Device subroutine calling match_any_sync with a default integer, an
	! integer(8), a real(4) and a real(8) value. The directives that follow expect
	! llvm.nvvm.match.any.sync.i32p / .i64p calls, with the real values first
	! converted by fir.convert to the integer type of the same width.
	attributes(device) subroutine testMatchAny()
	integer :: a, mask, v32
	integer(8) :: v64
	real(4) :: r4
	real(8) :: r8
	a = match_any_sync(mask, v32)
	a = match_any_sync(mask, v64)
	a = match_any_sync(mask, r4)
	a = match_any_sync(mask, r8)
	end subroutine

	! CHECK-LABEL: func.func @_QPtestmatchany()
	! CHECK: fir.call @llvm.nvvm.match.any.sync.i32p
	! CHECK: fir.call @llvm.nvvm.match.any.sync.i64p
	! CHECK: fir.convert %{{.*}} : (f32) -> i32
	! CHECK: fir.call @llvm.nvvm.match.any.sync.i32p
	! CHECK: fir.convert %{{.*}} : (f64) -> i64
	! CHECK: fir.call @llvm.nvvm.match.any.sync.i64p

	! Device subroutine exercising atomicexch, atomicxor and atomiccas, first on
	! local scalars and then, inside a do loop, on the assumed-size dummy array aa.
	! The directives that follow expect llvm.atomicrmw xchg / _xor and
	! llvm.cmpxchg operations in the same order as the calls.
	attributes(device) subroutine testAtomic(aa, n)
	integer :: aa(*)
	integer, intent(in) :: n
	integer :: a, istat, j, i
	real :: r
	istat = atomicexch(a,0)
	! Real exchange; expected as an f32 xchg.
	istat = atomicexch(r, 0.0)
	istat = atomicxor(a, j)
	istat = atomiccas(a, i, 14)
	do i = 1, n
	! aa is passed as a whole array; the expected address cast is from
	! !fir.ref<!fir.array<?xi32>>.
	istat = atomicxor(aa, i)
	istat = atomiccas(aa, i, 14)
	istat = atomicexch(aa, 0)
	end do
	end subroutine

	! CHECK-LABEL: func.func @_QPtestatomic
	! CHECK: llvm.atomicrmw xchg %{{.*}}, %c0{{.*}} seq_cst : !llvm.ptr, i32
	! CHECK: llvm.atomicrmw xchg %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
	! CHECK: llvm.atomicrmw _xor %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
	! CHECK: %[[ADDR:.*]] = builtin.unrealized_conversion_cast %{{.*}}#1 : !fir.ref<i32> to !llvm.ptr
	! CHECK: llvm.cmpxchg %[[ADDR]], %{{.*}}, %c14{{.*}} acq_rel monotonic : !llvm.ptr, i32
	! CHECK: fir.do_loop
	! CHECK: llvm.atomicrmw _xor %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
	! CHECK: %[[ADDR:.*]] = builtin.unrealized_conversion_cast %{{.*}}#1 : !fir.ref<!fir.array<?xi32>> to !llvm.ptr
	! CHECK: llvm.cmpxchg %[[ADDR]], %{{.*}}, %c14{{.*}} acq_rel monotonic : !llvm.ptr, i32
	! CHECK: llvm.atomicrmw xchg %{{.*}}, %c0{{.*}} seq_cst : !llvm.ptr, i32

	! atomiccas on integer(8) operands with the default-integer literal 14. The
	! directives that follow expect the constant to be converted from i32 to i64
	! and used as the new value of an i64 llvm.cmpxchg.
	attributes(device) subroutine testAtomic2()
	integer(8) :: a, i, istat
	istat = atomiccas(a, i, 14)
	end subroutine

	! CHECK-LABEL: func.func @_QPtestatomic2()
	! CHECK: %[[VAL:.*]] = fir.convert %c14{{.*}} : (i32) -> i64
	! CHECK: %[[ADDR:.*]] = builtin.unrealized_conversion_cast %{{.*}}#1 : !fir.ref<i64> to !llvm.ptr
	! CHECK: llvm.cmpxchg %{{.*}}, %{{.*}}, %[[VAL]] acq_rel monotonic : !llvm.ptr, i64

	! atomiccas on default-real operands. The directives that follow expect both
	! value operands to be bitcast from f32 to i32 and passed to an i32
	! llvm.cmpxchg.
	attributes(device) subroutine testAtomic3()
	real :: a, i, istat
	istat = atomiccas(a, i, 14.0)
	end subroutine

	! CHECK-LABEL: func.func @_QPtestatomic3()
	! CHECK: %[[BCAST1:.*]] = llvm.bitcast %{{.*}} : f32 to i32
	! CHECK: %[[BCAST2:.*]] = llvm.bitcast %{{.*}} : f32 to i32
	! CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %{{.*}}#1 : !fir.ref<f32> to !llvm.ptr
	! CHECK: llvm.cmpxchg %[[CAST]], %[[BCAST1]], %[[BCAST2]] acq_rel monotonic : !llvm.ptr, i32

	! atomiccas on real(8) operands. The directives that follow expect both value
	! operands to be bitcast from f64 to i64 and passed to an i64 llvm.cmpxchg,
	! followed by an llvm.extractvalue of element [1] of the (i64, i1) result.
	attributes(device) subroutine testAtomic4()
	real(8) :: a, i, istat
	istat = atomiccas(a, i, 14.0d0)
	end subroutine

	! CHECK-LABEL: func.func @_QPtestatomic4()
	! CHECK: %[[BCAST1:.*]] = llvm.bitcast %{{.*}} : f64 to i64
	! CHECK: %[[BCAST2:.*]] = llvm.bitcast %{{.*}} : f64 to i64
	! CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %{{.*}}#1 : !fir.ref<f64> to !llvm.ptr
	! CHECK: %[[ATOMIC:.*]] = llvm.cmpxchg %[[CAST]], %[[BCAST1]], %[[BCAST2]] acq_rel monotonic : !llvm.ptr, i64
	! CHECK: %[[RES:.*]] = llvm.extractvalue %[[ATOMIC]][1] : !llvm.struct<(i64, i1)>

	! Global kernel calling __ldca, __ldcg, __ldcs, __ldlu and __ldcv on a section
	! of the default-integer device array b and storing into the 4-element device
	! array x. The directives that follow expect calls to @__ldca_i4x4_ and its
	! siblings taking (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>).
	! Note: i and j are not declared (no implicit none) and are never assigned here.
	attributes(global) subroutine __ldXXi4(b)
	integer, device :: b(*)
	integer, device :: x(4)
	x(1:4) = __ldca(b(i:j))
	x = __ldcg(b(i:j))
	x = __ldcs(b(i:j))
	x(1:4) = __ldlu(b(i:j))
	x(1:4) = __ldcv(b(i:j))
	end

	! CHECK-LABEL: func.func @_QP__ldxxi4
	! CHECK: fir.call @__ldca_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
	! CHECK: fir.call @__ldcg_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
	! CHECK: fir.call @__ldcs_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
	! CHECK: fir.call @__ldlu_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
	! CHECK: fir.call @__ldcv_i4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()

	! Global kernel calling __ldca, __ldcg, __ldcs, __ldlu and __ldcv on a section
	! of the integer(8) device array b and storing into the 2-element device
	! array x. The directives that follow expect calls to @__ldca_i8x2_ and its
	! siblings taking (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>).
	! Note: i and j are not declared (no implicit none) and are never assigned here.
	attributes(global) subroutine __ldXXi8(b)
	integer(8), device :: b(*)
	integer(8), device :: x(2)
	x(1:2) = __ldca(b(i:j))
	x = __ldcg(b(i:j))
	x = __ldcs(b(i:j))
	x(1:2) = __ldlu(b(i:j))
	x(1:2) = __ldcv(b(i:j))
	end

	! CHECK-LABEL: func.func @_QP__ldxxi8
	! CHECK: fir.call @__ldca_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
	! CHECK: fir.call @__ldcg_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
	! CHECK: fir.call @__ldcs_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
	! CHECK: fir.call @__ldlu_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()
	! CHECK: fir.call @__ldcv_i8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xi64>>, !fir.ref<!fir.array<?xi64>>) -> ()

	! Global kernel calling __ldca, __ldcg, __ldcs, __ldlu and __ldcv on a section
	! of the default-real device array b and storing into the 4-element device
	! array x. The directives that follow expect calls to @__ldca_r4x4_ and its
	! siblings taking (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>).
	! Note: i and j are not declared (no implicit none) and are never assigned here.
	attributes(global) subroutine __ldXXr4(b)
	real, device :: b(*)
	real, device :: x(4)
	x(1:4) = __ldca(b(i:j))
	x = __ldcg(b(i:j))
	x = __ldcs(b(i:j))
	x(1:4) = __ldlu(b(i:j))
	x(1:4) = __ldcv(b(i:j))
	end

	! CHECK-LABEL: func.func @_QP__ldxxr4
	! CHECK: fir.call @__ldca_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
	! CHECK: fir.call @__ldcg_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
	! CHECK: fir.call @__ldcs_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
	! CHECK: fir.call @__ldlu_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()
	! CHECK: fir.call @__ldcv_r4x4_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<4xf32>>, !fir.ref<!fir.array<?xf32>>) -> ()

	! Global kernel calling __ldca, __ldcg, __ldcs, __ldlu and __ldcv on a section
	! of the real(2) device array b and storing into the 2-element device
	! array x. The directives that follow expect calls to @__ldca_r2x2_ and its
	! siblings taking (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>).
	! Note: i and j are not declared (no implicit none) and are never assigned here.
	attributes(global) subroutine __ldXXr2(b)
	real(2), device :: b(*)
	real(2), device :: x(2)
	x(1:2) = __ldca(b(i:j))
	x = __ldcg(b(i:j))
	x = __ldcs(b(i:j))
	x(1:2) = __ldlu(b(i:j))
	x(1:2) = __ldcv(b(i:j))
	end

	! CHECK-LABEL: func.func @_QP__ldxxr2
	! CHECK: fir.call @__ldca_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
	! CHECK: fir.call @__ldcg_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
	! CHECK: fir.call @__ldcs_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
	! CHECK: fir.call @__ldlu_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()
	! CHECK: fir.call @__ldcv_r2x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf16>>, !fir.ref<!fir.array<?xf16>>) -> ()

	! Global kernel calling __ldca, __ldcg, __ldcs, __ldlu and __ldcv on a section
	! of the real(8) device array b and storing into the 2-element device
	! array x. The directives that follow expect calls to @__ldca_r8x2_ and its
	! siblings taking (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>).
	! Note: i and j are not declared (no implicit none) and are never assigned here.
	attributes(global) subroutine __ldXXr8(b)
	real(8), device :: b(*)
	real(8), device :: x(2)
	x(1:2) = __ldca(b(i:j))
	x = __ldcg(b(i:j))
	x = __ldcs(b(i:j))
	x(1:2) = __ldlu(b(i:j))
	x(1:2) = __ldcv(b(i:j))
	end

	! CHECK-LABEL: func.func @_QP__ldxxr8
	! CHECK: fir.call @__ldca_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
	! CHECK: fir.call @__ldcg_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
	! CHECK: fir.call @__ldcs_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
	! CHECK: fir.call @__ldlu_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()
	! CHECK: fir.call @__ldcv_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<?xf64>>) -> ()