| // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 | 
 | // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ | 
 | // RUN:   -emit-llvm -o - %s | FileCheck --check-prefix=HOST %s | 
 | // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ | 
 | // RUN:   -emit-llvm -o - -fcuda-is-device %s | FileCheck --check-prefix=DEV %s | 
 | // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ | 
 | // RUN:   -fatomic-fine-grained-memory -fatomic-ignore-denormal-mode \ | 
 | // RUN:   -emit-llvm -o - -fcuda-is-device %s | FileCheck --check-prefix=OPT %s | 
 |  | 
 | #include "Inputs/cuda.h" | 
 |  | 
 | // HOST-LABEL: define dso_local void @_Z12test_defaultPf( | 
 | // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { | 
 | // HOST-NEXT:  [[ENTRY:.*:]] | 
 | // HOST-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8 | 
 | // HOST-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 | 
 | // HOST-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    ret void | 
 | // | 
 | // DEV-LABEL: define dso_local void @_Z12test_defaultPf( | 
 | // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { | 
 | // DEV-NEXT:  [[ENTRY:.*:]] | 
 | // DEV-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // DEV-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // DEV-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // DEV-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // DEV-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.no.remote.memory [[META4]] | 
 | // DEV-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    ret void | 
 | // | 
 | // OPT-LABEL: define dso_local void @_Z12test_defaultPf( | 
 | // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { | 
 | // OPT-NEXT:  [[ENTRY:.*:]] | 
 | // OPT-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // OPT-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // OPT-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // OPT-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // OPT-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]] | 
 | // OPT-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    ret void | 
 | // | 
 | __device__ __host__ void test_default(float *a) { | 
 |   __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); | 
 | } | 
 |  | 
 | // HOST-LABEL: define dso_local void @_Z8test_onePf( | 
 | // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // HOST-NEXT:  [[ENTRY:.*:]] | 
 | // HOST-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8 | 
 | // HOST-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 | 
 | // HOST-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    ret void | 
 | // | 
 | // DEV-LABEL: define dso_local void @_Z8test_onePf( | 
 | // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // DEV-NEXT:  [[ENTRY:.*:]] | 
 | // DEV-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // DEV-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // DEV-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // DEV-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // DEV-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] | 
 | // DEV-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    ret void | 
 | // | 
 | // OPT-LABEL: define dso_local void @_Z8test_onePf( | 
 | // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // OPT-NEXT:  [[ENTRY:.*:]] | 
 | // OPT-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // OPT-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // OPT-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // OPT-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // OPT-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]] | 
 | // OPT-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    ret void | 
 | // | 
 | __device__ __host__ void test_one(float *a) { | 
 |   [[clang::atomic(no_remote_memory)]] { | 
 |     __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); | 
 |   } | 
 | } | 
 |  | 
 | // HOST-LABEL: define dso_local void @_Z8test_twoPf( | 
 | // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // HOST-NEXT:  [[ENTRY:.*:]] | 
 | // HOST-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8 | 
 | // HOST-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 | 
 | // HOST-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    ret void | 
 | // | 
 | // DEV-LABEL: define dso_local void @_Z8test_twoPf( | 
 | // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // DEV-NEXT:  [[ENTRY:.*:]] | 
 | // DEV-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // DEV-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // DEV-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // DEV-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // DEV-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]] | 
 | // DEV-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    ret void | 
 | // | 
 | // OPT-LABEL: define dso_local void @_Z8test_twoPf( | 
 | // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // OPT-NEXT:  [[ENTRY:.*:]] | 
 | // OPT-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // OPT-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // OPT-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // OPT-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // OPT-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META4]] | 
 | // OPT-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    ret void | 
 | // | 
 | __device__ __host__ void test_two(float *a) { | 
 |   [[clang::atomic(remote_memory, ignore_denormal_mode)]] { | 
 |     __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); | 
 |   } | 
 | } | 
 |  | 
 | // HOST-LABEL: define dso_local void @_Z10test_threePf( | 
 | // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // HOST-NEXT:  [[ENTRY:.*:]] | 
 | // HOST-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8 | 
 | // HOST-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 | 
 | // HOST-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    ret void | 
 | // | 
 | // DEV-LABEL: define dso_local void @_Z10test_threePf( | 
 | // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // DEV-NEXT:  [[ENTRY:.*:]] | 
 | // DEV-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // DEV-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // DEV-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // DEV-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // DEV-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]] | 
 | // DEV-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    ret void | 
 | // | 
 | // OPT-LABEL: define dso_local void @_Z10test_threePf( | 
 | // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // OPT-NEXT:  [[ENTRY:.*:]] | 
 | // OPT-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // OPT-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // OPT-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // OPT-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // OPT-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]] | 
 | // OPT-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    ret void | 
 | // | 
 | __device__ __host__ void test_three(float *a) { | 
 |   [[clang::atomic(no_remote_memory, fine_grained_memory, no_ignore_denormal_mode)]] { | 
 |     __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); | 
 |   } | 
 | } | 
 |  | 
 | // HOST-LABEL: define dso_local void @_Z19test_multiple_attrsPf( | 
 | // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // HOST-NEXT:  [[ENTRY:.*:]] | 
 | // HOST-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8 | 
 | // HOST-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 | 
 | // HOST-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    ret void | 
 | // | 
 | // DEV-LABEL: define dso_local void @_Z19test_multiple_attrsPf( | 
 | // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // DEV-NEXT:  [[ENTRY:.*:]] | 
 | // DEV-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // DEV-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // DEV-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // DEV-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // DEV-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] | 
 | // DEV-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    ret void | 
 | // | 
 | // OPT-LABEL: define dso_local void @_Z19test_multiple_attrsPf( | 
 | // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // OPT-NEXT:  [[ENTRY:.*:]] | 
 | // OPT-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // OPT-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // OPT-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // OPT-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // OPT-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META4]] | 
 | // OPT-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    ret void | 
 | // | 
 | __device__ __host__ void test_multiple_attrs(float *a) { | 
 |   [[clang::atomic(no_remote_memory)]] [[clang::atomic(remote_memory)]] { | 
 |     __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); | 
 |   } | 
 | } | 
 |  | 
 | // HOST-LABEL: define dso_local void @_Z11test_nestedPf( | 
 | // HOST-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // HOST-NEXT:  [[ENTRY:.*:]] | 
 | // HOST-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8 | 
 | // HOST-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[DOTATOMICTMP1:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[DOTATOMICTMP3:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[ATOMIC_TEMP4:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[DOTATOMICTMP5:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    [[ATOMIC_TEMP6:%.*]] = alloca float, align 4 | 
 | // HOST-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 | 
 | // HOST-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4 | 
 | // HOST-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 | 
 | // HOST-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    store float 2.000000e+00, ptr [[DOTATOMICTMP1]], align 4 | 
 | // HOST-NEXT:    [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1]], align 4 | 
 | // HOST-NEXT:    [[TMP6:%.*]] = atomicrmw fmax ptr [[TMP4]], float [[TMP5]] seq_cst, align 4 | 
 | // HOST-NEXT:    store float [[TMP6]], ptr [[ATOMIC_TEMP2]], align 4 | 
 | // HOST-NEXT:    [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2]], align 4 | 
 | // HOST-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    store float 3.000000e+00, ptr [[DOTATOMICTMP3]], align 4 | 
 | // HOST-NEXT:    [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3]], align 4 | 
 | // HOST-NEXT:    [[TMP10:%.*]] = atomicrmw fmin ptr [[TMP8]], float [[TMP9]] acquire, align 4 | 
 | // HOST-NEXT:    store float [[TMP10]], ptr [[ATOMIC_TEMP4]], align 4 | 
 | // HOST-NEXT:    [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4]], align 4 | 
 | // HOST-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 | 
 | // HOST-NEXT:    store float 4.000000e+00, ptr [[DOTATOMICTMP5]], align 4 | 
 | // HOST-NEXT:    [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5]], align 4 | 
 | // HOST-NEXT:    [[TMP14:%.*]] = atomicrmw fsub ptr [[TMP12]], float [[TMP13]] release, align 4 | 
 | // HOST-NEXT:    store float [[TMP14]], ptr [[ATOMIC_TEMP6]], align 4 | 
 | // HOST-NEXT:    [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6]], align 4 | 
 | // HOST-NEXT:    ret void | 
 | // | 
 | // DEV-LABEL: define dso_local void @_Z11test_nestedPf( | 
 | // DEV-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // DEV-NEXT:  [[ENTRY:.*:]] | 
 | // DEV-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // DEV-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[DOTATOMICTMP1:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[DOTATOMICTMP3:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[ATOMIC_TEMP4:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[DOTATOMICTMP5:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[ATOMIC_TEMP6:%.*]] = alloca float, align 4, addrspace(5) | 
 | // DEV-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // DEV-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // DEV-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // DEV-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr | 
 | // DEV-NEXT:    [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr | 
 | // DEV-NEXT:    [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr | 
 | // DEV-NEXT:    [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr | 
 | // DEV-NEXT:    [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr | 
 | // DEV-NEXT:    [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr | 
 | // DEV-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] | 
 | // DEV-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    store float 2.000000e+00, ptr [[DOTATOMICTMP1_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP6:%.*]] = atomicrmw fmax ptr [[TMP4]], float [[TMP5]] syncscope("agent") seq_cst, align 4 | 
 | // DEV-NEXT:    store float [[TMP6]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    store float 3.000000e+00, ptr [[DOTATOMICTMP3_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP10:%.*]] = atomicrmw fmin ptr [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META4]] | 
 | // DEV-NEXT:    store float [[TMP10]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // DEV-NEXT:    store float 4.000000e+00, ptr [[DOTATOMICTMP5_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP14:%.*]] = atomicrmw fsub ptr [[TMP12]], float [[TMP13]] syncscope("wavefront") release, align 4, !amdgpu.no.fine.grained.memory [[META4]] | 
 | // DEV-NEXT:    store float [[TMP14]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 | 
 | // DEV-NEXT:    [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 | 
 | // DEV-NEXT:    ret void | 
 | // | 
 | // OPT-LABEL: define dso_local void @_Z11test_nestedPf( | 
 | // OPT-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { | 
 | // OPT-NEXT:  [[ENTRY:.*:]] | 
 | // OPT-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) | 
 | // OPT-NEXT:    [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[DOTATOMICTMP1:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[DOTATOMICTMP3:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[ATOMIC_TEMP4:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[DOTATOMICTMP5:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[ATOMIC_TEMP6:%.*]] = alloca float, align 4, addrspace(5) | 
 | // OPT-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr | 
 | // OPT-NEXT:    [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr | 
 | // OPT-NEXT:    [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr | 
 | // OPT-NEXT:    [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr | 
 | // OPT-NEXT:    [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr | 
 | // OPT-NEXT:    [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr | 
 | // OPT-NEXT:    [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr | 
 | // OPT-NEXT:    [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr | 
 | // OPT-NEXT:    [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr | 
 | // OPT-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]] | 
 | // OPT-NEXT:    store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    store float 2.000000e+00, ptr [[DOTATOMICTMP1_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP6:%.*]] = atomicrmw fmax ptr [[TMP4]], float [[TMP5]] syncscope("agent") seq_cst, align 4 | 
 | // OPT-NEXT:    store float [[TMP6]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    store float 3.000000e+00, ptr [[DOTATOMICTMP3_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP10:%.*]] = atomicrmw fmin ptr [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META4]] | 
 | // OPT-NEXT:    store float [[TMP10]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 | 
 | // OPT-NEXT:    store float 4.000000e+00, ptr [[DOTATOMICTMP5_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP14:%.*]] = atomicrmw fsub ptr [[TMP12]], float [[TMP13]] syncscope("wavefront") release, align 4, !amdgpu.no.fine.grained.memory [[META4]] | 
 | // OPT-NEXT:    store float [[TMP14]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 | 
 | // OPT-NEXT:    [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 | 
 | // OPT-NEXT:    ret void | 
 | // | 
 | __device__ __host__ void test_nested(float *a) { | 
 |   __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); | 
 |   { | 
 |     [[clang::atomic(remote_memory, fine_grained_memory, no_ignore_denormal_mode)]] { | 
 |       __scoped_atomic_fetch_max(a, 2, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); | 
 |       { | 
 |         [[clang::atomic(no_remote_memory)]] { | 
 |           __scoped_atomic_fetch_min(a, 3, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_WRKGRP); | 
 |         } | 
 |       } | 
 |       { | 
 |         [[clang::atomic(no_fine_grained_memory)]] { | 
 |           __scoped_atomic_fetch_sub(a, 4, __ATOMIC_RELEASE, __MEMORY_SCOPE_WVFRNT); | 
 |         } | 
 |       } | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | // | 
 | // | 
 | // | 
 | // | 
 | template<typename T> __device__ __host__ void test_template(T *a) { | 
 |   [[clang::atomic(no_remote_memory, fine_grained_memory)]] { | 
 |     __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); | 
 |   } | 
 | } | 
 |  | 
 | template __device__ __host__ void test_template<float>(float *a); | 
 |  | 
 | //. | 
 | // DEV: [[META4]] = !{} | 
 | //. | 
 | // OPT: [[META4]] = !{} | 
 | //. |