; test/CodeGen/X86/atomic-idempotent.ll - llvm-project/llvm - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs                           | FileCheck %s --check-prefix=X64
 ; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs           -mattr=+sse2      | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SSE2
 ; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=slm -mattr=-sse2      | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
 ; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=goldmont -mattr=-sse2 | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
 ; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=knl -mattr=-sse2      | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
 ; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=atom -mattr=-sse2     | FileCheck %s --check-prefixes=X86,X86-ATOM

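 ; On x86, an atomic rmw operation that does not modify the value in memory
 ; (such as atomic add 0) can be replaced by an mfence followed by a mov.
 ; This is explained (with the motivation for such an optimization) in
 ; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
 ;
 ; When the result of the idempotent operation is unused, most of the checks
 ; below expect only a barrier: a compiler-only #MEMBARRIER marker for orderings
 ; weaker than seq_cst, or a `lock orl $0` on a stack slot for seq_cst. The
 ; cases that do not yet get this treatment carry TODO comments.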

 ; add8: idempotent `atomicrmw add` of 0 on an i8 with monotonic ordering; the
 ; loaded value is returned.
 ; Expected output: the x86-64 and i686 +sse2 runs emit mfence followed by a
 ; plain byte load. The i686 runs with sse2 disabled keep a `lock xaddb` of a
 ; zeroed register; the atom run additionally has two nops before the return.
 define i8 @add8(i8* %p) {
 ; X64-LABEL: add8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    mfence
 ; X64-NEXT:    movb (%rdi), %al
 ; X64-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: add8:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
 ; X86-SSE2-NEXT:    movb (%eax), %al
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SLM-LABEL: add8:
 ; X86-SLM:       # %bb.0:
 ; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLM-NEXT:    xorl %eax, %eax
 ; X86-SLM-NEXT:    lock xaddb %al, (%ecx)
 ; X86-SLM-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: add8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-ATOM-NEXT:    xorl %eax, %eax
 ; X86-ATOM-NEXT:    lock xaddb %al, (%ecx)
 ; X86-ATOM-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   ; Adding 0 leaves memory unchanged; %1 is the value read from %p.
   %1 = atomicrmw add i8* %p, i8 0 monotonic
   ret i8 %1
 }

 ; or16: idempotent `atomicrmw or` of 0 on an i16 with acquire ordering; the
 ; loaded value is returned.
 ; Expected output: the x86-64 and i686 +sse2 runs emit mfence followed by a
 ; zero-extending 16-bit load. The i686 runs with sse2 disabled expect a
 ; `lock cmpxchgw` retry loop instead.
 define i16 @or16(i16* %p) {
 ; X64-LABEL: or16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    mfence
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: or16:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
 ; X86-SSE2-NEXT:    movzwl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SLM-LABEL: or16:
 ; X86-SLM:       # %bb.0:
 ; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLM-NEXT:    movzwl (%ecx), %eax
 ; X86-SLM-NEXT:    .p2align 4, 0x90
 ; X86-SLM-NEXT:  .LBB1_1: # %atomicrmw.start
 ; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-SLM-NEXT:    lock cmpxchgw %ax, (%ecx)
 ; X86-SLM-NEXT:    jne .LBB1_1
 ; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-ATOM-NEXT:    movzwl (%ecx), %eax
 ; X86-ATOM-NEXT:    .p2align 4, 0x90
 ; X86-ATOM-NEXT:  .LBB1_1: # %atomicrmw.start
 ; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-ATOM-NEXT:    lock cmpxchgw %ax, (%ecx)
 ; X86-ATOM-NEXT:    jne .LBB1_1
 ; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-ATOM-NEXT:    retl
   ; OR with 0 leaves memory unchanged; %1 is the value read from %p.
   %1 = atomicrmw or i16* %p, i16 0 acquire
   ret i16 %1
 }

 ; xor32: idempotent `atomicrmw xor` of 0 on an i32 with release ordering; the
 ; loaded value is returned.
 ; Expected output: the x86-64 and i686 +sse2 runs emit mfence followed by a
 ; 32-bit load. The i686 runs with sse2 disabled expect a `lock cmpxchgl`
 ; retry loop instead.
 define i32 @xor32(i32* %p) {
 ; X64-LABEL: xor32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    mfence
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: xor32:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
 ; X86-SSE2-NEXT:    movl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SLM-LABEL: xor32:
 ; X86-SLM:       # %bb.0:
 ; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLM-NEXT:    movl (%ecx), %eax
 ; X86-SLM-NEXT:    .p2align 4, 0x90
 ; X86-SLM-NEXT:  .LBB2_1: # %atomicrmw.start
 ; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-SLM-NEXT:    lock cmpxchgl %eax, (%ecx)
 ; X86-SLM-NEXT:    jne .LBB2_1
 ; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: xor32:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-ATOM-NEXT:    movl (%ecx), %eax
 ; X86-ATOM-NEXT:    .p2align 4, 0x90
 ; X86-ATOM-NEXT:  .LBB2_1: # %atomicrmw.start
 ; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-ATOM-NEXT:    lock cmpxchgl %eax, (%ecx)
 ; X86-ATOM-NEXT:    jne .LBB2_1
 ; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-ATOM-NEXT:    retl
   ; XOR with 0 leaves memory unchanged; %1 is the value read from %p.
   %1 = atomicrmw xor i32* %p, i32 0 release
   ret i32 %1
 }

 ; sub64: idempotent `atomicrmw sub` of 0 on an i64 with seq_cst ordering; the
 ; loaded value is returned.
 ; Expected output: the x86-64 run emits mfence followed by a 64-bit load.
 ; Every i686 run (shared check prefix) expects a `lock cmpxchg8b` retry loop,
 ; with %ebx and %esi saved and restored around it.
 define i64 @sub64(i64* %p) {
 ; X64-LABEL: sub64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    mfence
 ; X64-NEXT:    movq (%rdi), %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: sub64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    .cfi_offset %esi, -12
 ; X86-NEXT:    .cfi_offset %ebx, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    movl 4(%esi), %edx
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB3_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    lock cmpxchg8b (%esi)
 ; X86-NEXT:    jne .LBB3_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   ; Subtracting 0 leaves memory unchanged; %1 is the value read from %p.
   %1 = atomicrmw sub i64* %p, i64 0 seq_cst
   ret i64 %1
 }

 ; or128: idempotent `atomicrmw or` of 0 on an i128 with monotonic ordering;
 ; the loaded value is returned.
 ; Expected output: every run calls __sync_fetch_and_or_16 with a zero operand;
 ; none of them expects the mfence + load form. The i686 runs then copy the
 ; four 32-bit result words from the stack into the buffer addressed by %esi
 ; and return with `retl $4`.
 define i128 @or128(i128* %p) {
 ; X64-LABEL: or128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
 ; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    callq __sync_fetch_and_or_16
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: or128:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; X86-SSE2-NEXT:    .cfi_offset %ebp, -8
 ; X86-SSE2-NEXT:    movl %esp, %ebp
 ; X86-SSE2-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-8, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    .cfi_offset %esi, -16
 ; X86-SSE2-NEXT:    .cfi_offset %edi, -12
 ; X86-SSE2-NEXT:    movl 8(%ebp), %esi
 ; X86-SSE2-NEXT:    movl %esp, %eax
 ; X86-SSE2-NEXT:    pushl $0
 ; X86-SSE2-NEXT:    pushl $0
 ; X86-SSE2-NEXT:    pushl $0
 ; X86-SSE2-NEXT:    pushl $0
 ; X86-SSE2-NEXT:    pushl 12(%ebp)
 ; X86-SSE2-NEXT:    pushl %eax
 ; X86-SSE2-NEXT:    calll __sync_fetch_and_or_16
 ; X86-SSE2-NEXT:    addl $20, %esp
 ; X86-SSE2-NEXT:    movl (%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE2-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE2-NEXT:    movl %edx, 12(%esi)
 ; X86-SSE2-NEXT:    movl %eax, (%esi)
 ; X86-SSE2-NEXT:    movl %ecx, 4(%esi)
 ; X86-SSE2-NEXT:    movl %esi, %eax
 ; X86-SSE2-NEXT:    leal -8(%ebp), %esp
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-SSE2-NEXT:    retl $4
 ;
 ; X86-SLM-LABEL: or128:
 ; X86-SLM:       # %bb.0:
 ; X86-SLM-NEXT:    pushl %ebp
 ; X86-SLM-NEXT:    .cfi_def_cfa_offset 8
 ; X86-SLM-NEXT:    .cfi_offset %ebp, -8
 ; X86-SLM-NEXT:    movl %esp, %ebp
 ; X86-SLM-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-SLM-NEXT:    pushl %edi
 ; X86-SLM-NEXT:    pushl %esi
 ; X86-SLM-NEXT:    andl $-8, %esp
 ; X86-SLM-NEXT:    subl $16, %esp
 ; X86-SLM-NEXT:    .cfi_offset %esi, -16
 ; X86-SLM-NEXT:    .cfi_offset %edi, -12
 ; X86-SLM-NEXT:    movl 8(%ebp), %esi
 ; X86-SLM-NEXT:    movl 12(%ebp), %eax
 ; X86-SLM-NEXT:    movl %esp, %ecx
 ; X86-SLM-NEXT:    pushl $0
 ; X86-SLM-NEXT:    pushl $0
 ; X86-SLM-NEXT:    pushl $0
 ; X86-SLM-NEXT:    pushl $0
 ; X86-SLM-NEXT:    pushl %eax
 ; X86-SLM-NEXT:    pushl %ecx
 ; X86-SLM-NEXT:    calll __sync_fetch_and_or_16
 ; X86-SLM-NEXT:    addl $20, %esp
 ; X86-SLM-NEXT:    movl (%esp), %eax
 ; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SLM-NEXT:    movl %edi, 8(%esi)
 ; X86-SLM-NEXT:    movl %edx, 12(%esi)
 ; X86-SLM-NEXT:    movl %eax, (%esi)
 ; X86-SLM-NEXT:    movl %ecx, 4(%esi)
 ; X86-SLM-NEXT:    movl %esi, %eax
 ; X86-SLM-NEXT:    leal -8(%ebp), %esp
 ; X86-SLM-NEXT:    popl %esi
 ; X86-SLM-NEXT:    popl %edi
 ; X86-SLM-NEXT:    popl %ebp
 ; X86-SLM-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-SLM-NEXT:    retl $4
 ;
 ; X86-ATOM-LABEL: or128:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    pushl %ebp
 ; X86-ATOM-NEXT:    .cfi_def_cfa_offset 8
 ; X86-ATOM-NEXT:    .cfi_offset %ebp, -8
 ; X86-ATOM-NEXT:    leal (%esp), %ebp
 ; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-ATOM-NEXT:    pushl %edi
 ; X86-ATOM-NEXT:    pushl %esi
 ; X86-ATOM-NEXT:    andl $-8, %esp
 ; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
 ; X86-ATOM-NEXT:    .cfi_offset %esi, -16
 ; X86-ATOM-NEXT:    .cfi_offset %edi, -12
 ; X86-ATOM-NEXT:    movl 8(%ebp), %esi
 ; X86-ATOM-NEXT:    movl 12(%ebp), %eax
 ; X86-ATOM-NEXT:    movl %esp, %ecx
 ; X86-ATOM-NEXT:    pushl $0
 ; X86-ATOM-NEXT:    pushl $0
 ; X86-ATOM-NEXT:    pushl $0
 ; X86-ATOM-NEXT:    pushl $0
 ; X86-ATOM-NEXT:    pushl %eax
 ; X86-ATOM-NEXT:    pushl %ecx
 ; X86-ATOM-NEXT:    calll __sync_fetch_and_or_16
 ; X86-ATOM-NEXT:    leal {{[0-9]+}}(%esp), %esp
 ; X86-ATOM-NEXT:    movl (%esp), %ecx
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-ATOM-NEXT:    movl %eax, 8(%esi)
 ; X86-ATOM-NEXT:    movl %edi, 12(%esi)
 ; X86-ATOM-NEXT:    movl %ecx, (%esi)
 ; X86-ATOM-NEXT:    movl %esi, %eax
 ; X86-ATOM-NEXT:    movl %edx, 4(%esi)
 ; X86-ATOM-NEXT:    leal -8(%ebp), %esp
 ; X86-ATOM-NEXT:    popl %esi
 ; X86-ATOM-NEXT:    popl %edi
 ; X86-ATOM-NEXT:    popl %ebp
 ; X86-ATOM-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-ATOM-NEXT:    retl $4
   ; OR with 0 leaves memory unchanged; %1 is the value read from %p.
   %1 = atomicrmw or i128* %p, i128 0 monotonic
   ret i128 %1
 }

 ; For 'and', the idempotent value is (-1)
 ; and32: `atomicrmw and` of -1 on an i32 with acq_rel ordering; the loaded
 ; value is returned.
 ; Expected output: the x86-64 and i686 +sse2 runs emit mfence followed by a
 ; 32-bit load. The i686 runs with sse2 disabled expect a `lock cmpxchgl`
 ; retry loop instead.
 define i32 @and32 (i32* %p) {
 ; X64-LABEL: and32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    mfence
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: and32:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
 ; X86-SSE2-NEXT:    movl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SLM-LABEL: and32:
 ; X86-SLM:       # %bb.0:
 ; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLM-NEXT:    movl (%ecx), %eax
 ; X86-SLM-NEXT:    .p2align 4, 0x90
 ; X86-SLM-NEXT:  .LBB5_1: # %atomicrmw.start
 ; X86-SLM-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-SLM-NEXT:    lock cmpxchgl %eax, (%ecx)
 ; X86-SLM-NEXT:    jne .LBB5_1
 ; X86-SLM-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: and32:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-ATOM-NEXT:    movl (%ecx), %eax
 ; X86-ATOM-NEXT:    .p2align 4, 0x90
 ; X86-ATOM-NEXT:  .LBB5_1: # %atomicrmw.start
 ; X86-ATOM-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-ATOM-NEXT:    lock cmpxchgl %eax, (%ecx)
 ; X86-ATOM-NEXT:    jne .LBB5_1
 ; X86-ATOM-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-ATOM-NEXT:    retl
   ; AND with all-ones leaves memory unchanged; %1 is the value read from %p.
   %1 = atomicrmw and i32* %p, i32 -1 acq_rel
   ret i32 %1
 }

 ; or32_nouse_monotonic: idempotent `atomicrmw or` of 0 on an i32 with
 ; monotonic ordering whose result is discarded.
 ; Expected output: every run emits only the #MEMBARRIER marker before the
 ; return, with no load or locked instruction. The atom run has six nops
 ; between the marker and the return.
 define void @or32_nouse_monotonic(i32* %p) {
 ; X64-LABEL: or32_nouse_monotonic:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: or32_nouse_monotonic:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    #MEMBARRIER
 ; X86-GENERIC-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or32_nouse_monotonic:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    #MEMBARRIER
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   ; The result is intentionally unnamed and unused.
   atomicrmw or i32* %p, i32 0 monotonic
   ret void
 }


 ; or32_nouse_acquire: idempotent `atomicrmw or` of 0 on an i32 with acquire
 ; ordering whose result is discarded.
 ; Expected output: every run emits only the #MEMBARRIER marker before the
 ; return. The atom run has six nops between the marker and the return.
 define void @or32_nouse_acquire(i32* %p) {
 ; X64-LABEL: or32_nouse_acquire:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: or32_nouse_acquire:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    #MEMBARRIER
 ; X86-GENERIC-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or32_nouse_acquire:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    #MEMBARRIER
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   ; The result is intentionally unnamed and unused.
   atomicrmw or i32* %p, i32 0 acquire
   ret void
 }

 ; or32_nouse_release: idempotent `atomicrmw or` of 0 on an i32 with release
 ; ordering whose result is discarded.
 ; Expected output: every run emits only the #MEMBARRIER marker before the
 ; return. The atom run has six nops between the marker and the return.
 define void @or32_nouse_release(i32* %p) {
 ; X64-LABEL: or32_nouse_release:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: or32_nouse_release:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    #MEMBARRIER
 ; X86-GENERIC-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or32_nouse_release:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    #MEMBARRIER
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   ; The result is intentionally unnamed and unused.
   atomicrmw or i32* %p, i32 0 release
   ret void
 }

 ; or32_nouse_acq_rel: idempotent `atomicrmw or` of 0 on an i32 with acq_rel
 ; ordering whose result is discarded.
 ; Expected output: every run emits only the #MEMBARRIER marker before the
 ; return. The atom run has six nops between the marker and the return.
 define void @or32_nouse_acq_rel(i32* %p) {
 ; X64-LABEL: or32_nouse_acq_rel:
 ; X64:       # %bb.0:
 ; X64-NEXT:    #MEMBARRIER
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: or32_nouse_acq_rel:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    #MEMBARRIER
 ; X86-GENERIC-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or32_nouse_acq_rel:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    #MEMBARRIER
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   ; The result is intentionally unnamed and unused.
   atomicrmw or i32* %p, i32 0 acq_rel
   ret void
 }

 ; or32_nouse_seq_cst: idempotent `atomicrmw or` of 0 on an i32 with seq_cst
 ; ordering whose result is discarded.
 ; Expected output: no access to %p at all; instead a `lock orl $0` on a stack
 ; slot (a negative offset from %rsp on x86-64, (%esp) on i686). The atom run
 ; has six nops before the return.
 define void @or32_nouse_seq_cst(i32* %p) {
 ; X64-LABEL: or32_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: or32_nouse_seq_cst:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    lock orl $0, (%esp)
 ; X86-GENERIC-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or32_nouse_seq_cst:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    lock orl $0, (%esp)
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   ; The result is intentionally unnamed and unused.
   atomicrmw or i32* %p, i32 0 seq_cst
   ret void
 }

 ; TODO: The value isn't used on 32-bit, so the cmpxchg8b is unneeded
 ; or64_nouse_seq_cst: idempotent `atomicrmw or` of 0 on an i64 with seq_cst
 ; ordering whose result is discarded.
 ; Expected output: the x86-64 run emits a `lock orl $0` on a stack slot.
 ; Every i686 run (shared check prefix) still expects the full
 ; `lock cmpxchg8b` retry loop, which is what the TODO above refers to.
 define void @or64_nouse_seq_cst(i64* %p) {
 ; X64-LABEL: or64_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: or64_nouse_seq_cst:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    .cfi_offset %esi, -12
 ; X86-NEXT:    .cfi_offset %ebx, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    movl 4(%esi), %edx
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB11_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    lock cmpxchg8b (%esi)
 ; X86-NEXT:    jne .LBB11_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   ; The result is intentionally unnamed and unused.
   atomicrmw or i64* %p, i64 0 seq_cst
   ret void
 }

 ; TODO: Don't need to lower as a __sync_fetch_and_or_16 libcall
 ; or128_nouse_seq_cst: idempotent `atomicrmw or` of 0 on an i128 with seq_cst
 ; ordering whose result is discarded.
 ; Expected output: every run still calls __sync_fetch_and_or_16 with a zero
 ; operand, which is what the TODO above refers to. The i686 runs reserve
 ; stack space and pass its address as an extra argument, then return without
 ; reading it back.
 define void @or128_nouse_seq_cst(i128* %p) {
 ; X64-LABEL: or128_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    pushq %rax
 ; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    callq __sync_fetch_and_or_16
 ; X64-NEXT:    popq %rax
 ; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: or128_nouse_seq_cst:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; X86-SSE2-NEXT:    .cfi_offset %ebp, -8
 ; X86-SSE2-NEXT:    movl %esp, %ebp
 ; X86-SSE2-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-SSE2-NEXT:    andl $-8, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    movl %esp, %eax
 ; X86-SSE2-NEXT:    pushl $0
 ; X86-SSE2-NEXT:    pushl $0
 ; X86-SSE2-NEXT:    pushl $0
 ; X86-SSE2-NEXT:    pushl $0
 ; X86-SSE2-NEXT:    pushl 8(%ebp)
 ; X86-SSE2-NEXT:    pushl %eax
 ; X86-SSE2-NEXT:    calll __sync_fetch_and_or_16
 ; X86-SSE2-NEXT:    addl $20, %esp
 ; X86-SSE2-NEXT:    movl %ebp, %esp
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SLM-LABEL: or128_nouse_seq_cst:
 ; X86-SLM:       # %bb.0:
 ; X86-SLM-NEXT:    pushl %ebp
 ; X86-SLM-NEXT:    .cfi_def_cfa_offset 8
 ; X86-SLM-NEXT:    .cfi_offset %ebp, -8
 ; X86-SLM-NEXT:    movl %esp, %ebp
 ; X86-SLM-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-SLM-NEXT:    andl $-8, %esp
 ; X86-SLM-NEXT:    subl $16, %esp
 ; X86-SLM-NEXT:    movl 8(%ebp), %eax
 ; X86-SLM-NEXT:    movl %esp, %ecx
 ; X86-SLM-NEXT:    pushl $0
 ; X86-SLM-NEXT:    pushl $0
 ; X86-SLM-NEXT:    pushl $0
 ; X86-SLM-NEXT:    pushl $0
 ; X86-SLM-NEXT:    pushl %eax
 ; X86-SLM-NEXT:    pushl %ecx
 ; X86-SLM-NEXT:    calll __sync_fetch_and_or_16
 ; X86-SLM-NEXT:    addl $20, %esp
 ; X86-SLM-NEXT:    movl %ebp, %esp
 ; X86-SLM-NEXT:    popl %ebp
 ; X86-SLM-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-SLM-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or128_nouse_seq_cst:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    pushl %ebp
 ; X86-ATOM-NEXT:    .cfi_def_cfa_offset 8
 ; X86-ATOM-NEXT:    .cfi_offset %ebp, -8
 ; X86-ATOM-NEXT:    leal (%esp), %ebp
 ; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-ATOM-NEXT:    andl $-8, %esp
 ; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
 ; X86-ATOM-NEXT:    movl 8(%ebp), %eax
 ; X86-ATOM-NEXT:    movl %esp, %ecx
 ; X86-ATOM-NEXT:    pushl $0
 ; X86-ATOM-NEXT:    pushl $0
 ; X86-ATOM-NEXT:    pushl $0
 ; X86-ATOM-NEXT:    pushl $0
 ; X86-ATOM-NEXT:    pushl %eax
 ; X86-ATOM-NEXT:    pushl %ecx
 ; X86-ATOM-NEXT:    calll __sync_fetch_and_or_16
 ; X86-ATOM-NEXT:    leal {{[0-9]+}}(%esp), %esp
 ; X86-ATOM-NEXT:    movl %ebp, %esp
 ; X86-ATOM-NEXT:    popl %ebp
 ; X86-ATOM-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-ATOM-NEXT:    retl
   ; The result is intentionally unnamed and unused.
   atomicrmw or i128* %p, i128 0 seq_cst
   ret void
 }


 ; or16_nouse_seq_cst: idempotent `atomicrmw or` of 0 on an i16 with seq_cst
 ; ordering whose result is discarded.
 ; Expected output: no access to %p; a 32-bit `lock orl $0` on a stack slot
 ; in every run. The atom run has six nops before the return.
 define void @or16_nouse_seq_cst(i16* %p) {
 ; X64-LABEL: or16_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: or16_nouse_seq_cst:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    lock orl $0, (%esp)
 ; X86-GENERIC-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or16_nouse_seq_cst:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    lock orl $0, (%esp)
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   ; The result is intentionally unnamed and unused.
   atomicrmw or i16* %p, i16 0 seq_cst
   ret void
 }

 ; or8_nouse_seq_cst: idempotent `atomicrmw or` of 0 on an i8 with seq_cst
 ; ordering whose result is discarded.
 ; Expected output: no access to %p; a 32-bit `lock orl $0` on a stack slot
 ; in every run. The atom run has six nops before the return.
 define void @or8_nouse_seq_cst(i8* %p) {
 ; X64-LABEL: or8_nouse_seq_cst:
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    retq
 ;
 ; X86-GENERIC-LABEL: or8_nouse_seq_cst:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    lock orl $0, (%esp)
 ; X86-GENERIC-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or8_nouse_seq_cst:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    lock orl $0, (%esp)
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    nop
 ; X86-ATOM-NEXT:    retl
   ; The result is intentionally unnamed and unused.
   atomicrmw or i8* %p, i8 0 seq_cst
   ret void
 }
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefix=X64
	; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SSE2
	; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=slm -mattr=-sse2 | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
	; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=goldmont -mattr=-sse2 | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
	; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=knl -mattr=-sse2 | FileCheck %s --check-prefixes=X86,X86-GENERIC,X86-SLM
	; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs -mcpu=atom -mattr=-sse2 | FileCheck %s --check-prefixes=X86,X86-ATOM

	; On x86, an atomic rmw operation that does not modify the value in memory
	; (such as atomic add 0) can be replaced by an mfence followed by a mov.
	; This is explained (with the motivation for such an optimization) in
	; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf

	; add8: idempotent `atomicrmw add` of 0 on an i8 with monotonic ordering; the
	; loaded value is returned. The checks expect mfence + byte load on x86-64
	; and i686 +sse2, and `lock xaddb` on the i686 runs with sse2 disabled.
	; NOTE(review): @add8 is also defined earlier in this file; this copy looks
	; like an extraction duplicate - confirm before relying on it.
	define i8 @add8(i8* %p) {
	; X64-LABEL: add8:
	; X64: # %bb.0:
	; X64-NEXT: mfence
	; X64-NEXT: movb (%rdi), %al
	; X64-NEXT: retq
	;
	; X86-SSE2-LABEL: add8:
	; X86-SSE2: # %bb.0:
	; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-SSE2-NEXT: mfence
	; X86-SSE2-NEXT: movb (%eax), %al
	; X86-SSE2-NEXT: retl
	;
	; X86-SLM-LABEL: add8:
	; X86-SLM: # %bb.0:
	; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-SLM-NEXT: xorl %eax, %eax
	; X86-SLM-NEXT: lock xaddb %al, (%ecx)
	; X86-SLM-NEXT: # kill: def $al killed $al killed $eax
	; X86-SLM-NEXT: retl
	;
	; X86-ATOM-LABEL: add8:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-ATOM-NEXT: xorl %eax, %eax
	; X86-ATOM-NEXT: lock xaddb %al, (%ecx)
	; X86-ATOM-NEXT: # kill: def $al killed $al killed $eax
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: retl
	; Adding 0 leaves memory unchanged; %1 is the value read from %p.
	%1 = atomicrmw add i8* %p, i8 0 monotonic
	ret i8 %1
	}

	; or16: idempotent `atomicrmw or` of 0 on an i16 with acquire ordering; the
	; loaded value is returned. The checks expect mfence + zero-extending load on
	; x86-64 and i686 +sse2, and a `lock cmpxchgw` loop with sse2 disabled.
	; NOTE(review): @or16 is also defined earlier in this file; this copy looks
	; like an extraction duplicate - confirm before relying on it.
	define i16 @or16(i16* %p) {
	; X64-LABEL: or16:
	; X64: # %bb.0:
	; X64-NEXT: mfence
	; X64-NEXT: movzwl (%rdi), %eax
	; X64-NEXT: retq
	;
	; X86-SSE2-LABEL: or16:
	; X86-SSE2: # %bb.0:
	; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-SSE2-NEXT: mfence
	; X86-SSE2-NEXT: movzwl (%eax), %eax
	; X86-SSE2-NEXT: retl
	;
	; X86-SLM-LABEL: or16:
	; X86-SLM: # %bb.0:
	; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-SLM-NEXT: movzwl (%ecx), %eax
	; X86-SLM-NEXT: .p2align 4, 0x90
	; X86-SLM-NEXT: .LBB1_1: # %atomicrmw.start
	; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
	; X86-SLM-NEXT: lock cmpxchgw %ax, (%ecx)
	; X86-SLM-NEXT: jne .LBB1_1
	; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
	; X86-SLM-NEXT: retl
	;
	; X86-ATOM-LABEL: or16:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-ATOM-NEXT: movzwl (%ecx), %eax
	; X86-ATOM-NEXT: .p2align 4, 0x90
	; X86-ATOM-NEXT: .LBB1_1: # %atomicrmw.start
	; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
	; X86-ATOM-NEXT: lock cmpxchgw %ax, (%ecx)
	; X86-ATOM-NEXT: jne .LBB1_1
	; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
	; X86-ATOM-NEXT: retl
	; OR with 0 leaves memory unchanged; %1 is the value read from %p.
	%1 = atomicrmw or i16* %p, i16 0 acquire
	ret i16 %1
	}

	; xor32: idempotent `atomicrmw xor` of 0 on an i32 with release ordering; the
	; loaded value is returned. The checks expect mfence + 32-bit load on x86-64
	; and i686 +sse2, and a `lock cmpxchgl` loop with sse2 disabled.
	; NOTE(review): @xor32 is also defined earlier in this file; this copy looks
	; like an extraction duplicate - confirm before relying on it.
	define i32 @xor32(i32* %p) {
	; X64-LABEL: xor32:
	; X64: # %bb.0:
	; X64-NEXT: mfence
	; X64-NEXT: movl (%rdi), %eax
	; X64-NEXT: retq
	;
	; X86-SSE2-LABEL: xor32:
	; X86-SSE2: # %bb.0:
	; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-SSE2-NEXT: mfence
	; X86-SSE2-NEXT: movl (%eax), %eax
	; X86-SSE2-NEXT: retl
	;
	; X86-SLM-LABEL: xor32:
	; X86-SLM: # %bb.0:
	; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-SLM-NEXT: movl (%ecx), %eax
	; X86-SLM-NEXT: .p2align 4, 0x90
	; X86-SLM-NEXT: .LBB2_1: # %atomicrmw.start
	; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
	; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
	; X86-SLM-NEXT: jne .LBB2_1
	; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
	; X86-SLM-NEXT: retl
	;
	; X86-ATOM-LABEL: xor32:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-ATOM-NEXT: movl (%ecx), %eax
	; X86-ATOM-NEXT: .p2align 4, 0x90
	; X86-ATOM-NEXT: .LBB2_1: # %atomicrmw.start
	; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
	; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
	; X86-ATOM-NEXT: jne .LBB2_1
	; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
	; X86-ATOM-NEXT: retl
	; XOR with 0 leaves memory unchanged; %1 is the value read from %p.
	%1 = atomicrmw xor i32* %p, i32 0 release
	ret i32 %1
	}

	; sub64: idempotent `atomicrmw sub` of 0 on an i64 with seq_cst ordering; the
	; loaded value is returned. The checks expect mfence + 64-bit load on x86-64
	; and a `lock cmpxchg8b` loop on every i686 run.
	; NOTE(review): @sub64 is also defined earlier in this file; this copy looks
	; like an extraction duplicate - confirm before relying on it.
	define i64 @sub64(i64* %p) {
	; X64-LABEL: sub64:
	; X64: # %bb.0:
	; X64-NEXT: mfence
	; X64-NEXT: movq (%rdi), %rax
	; X64-NEXT: retq
	;
	; X86-LABEL: sub64:
	; X86: # %bb.0:
	; X86-NEXT: pushl %ebx
	; X86-NEXT: .cfi_def_cfa_offset 8
	; X86-NEXT: pushl %esi
	; X86-NEXT: .cfi_def_cfa_offset 12
	; X86-NEXT: .cfi_offset %esi, -12
	; X86-NEXT: .cfi_offset %ebx, -8
	; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
	; X86-NEXT: movl (%esi), %eax
	; X86-NEXT: movl 4(%esi), %edx
	; X86-NEXT: .p2align 4, 0x90
	; X86-NEXT: .LBB3_1: # %atomicrmw.start
	; X86-NEXT: # =>This Inner Loop Header: Depth=1
	; X86-NEXT: movl %edx, %ecx
	; X86-NEXT: movl %eax, %ebx
	; X86-NEXT: lock cmpxchg8b (%esi)
	; X86-NEXT: jne .LBB3_1
	; X86-NEXT: # %bb.2: # %atomicrmw.end
	; X86-NEXT: popl %esi
	; X86-NEXT: .cfi_def_cfa_offset 8
	; X86-NEXT: popl %ebx
	; X86-NEXT: .cfi_def_cfa_offset 4
	; X86-NEXT: retl
	; Subtracting 0 leaves memory unchanged; %1 is the value read from %p.
	%1 = atomicrmw sub i64* %p, i64 0 seq_cst
	ret i64 %1
	}

	; or128: idempotent `atomicrmw or` of 0 on an i128 with monotonic ordering;
	; the loaded value is returned. Every run is expected to call
	; __sync_fetch_and_or_16 with a zero operand.
	; NOTE(review): @or128 is also defined earlier in this file; this copy looks
	; like an extraction duplicate - confirm before relying on it.
	define i128 @or128(i128* %p) {
	; X64-LABEL: or128:
	; X64: # %bb.0:
	; X64-NEXT: pushq %rax
	; X64-NEXT: .cfi_def_cfa_offset 16
	; X64-NEXT: xorl %esi, %esi
	; X64-NEXT: xorl %edx, %edx
	; X64-NEXT: callq __sync_fetch_and_or_16
	; X64-NEXT: popq %rcx
	; X64-NEXT: .cfi_def_cfa_offset 8
	; X64-NEXT: retq
	;
	; X86-SSE2-LABEL: or128:
	; X86-SSE2: # %bb.0:
	; X86-SSE2-NEXT: pushl %ebp
	; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
	; X86-SSE2-NEXT: .cfi_offset %ebp, -8
	; X86-SSE2-NEXT: movl %esp, %ebp
	; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
	; X86-SSE2-NEXT: pushl %edi
	; X86-SSE2-NEXT: pushl %esi
	; X86-SSE2-NEXT: andl $-8, %esp
	; X86-SSE2-NEXT: subl $16, %esp
	; X86-SSE2-NEXT: .cfi_offset %esi, -16
	; X86-SSE2-NEXT: .cfi_offset %edi, -12
	; X86-SSE2-NEXT: movl 8(%ebp), %esi
	; X86-SSE2-NEXT: movl %esp, %eax
	; X86-SSE2-NEXT: pushl $0
	; X86-SSE2-NEXT: pushl $0
	; X86-SSE2-NEXT: pushl $0
	; X86-SSE2-NEXT: pushl $0
	; X86-SSE2-NEXT: pushl 12(%ebp)
	; X86-SSE2-NEXT: pushl %eax
	; X86-SSE2-NEXT: calll __sync_fetch_and_or_16
	; X86-SSE2-NEXT: addl $20, %esp
	; X86-SSE2-NEXT: movl (%esp), %eax
	; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
	; X86-SSE2-NEXT: movl %edi, 8(%esi)
	; X86-SSE2-NEXT: movl %edx, 12(%esi)
	; X86-SSE2-NEXT: movl %eax, (%esi)
	; X86-SSE2-NEXT: movl %ecx, 4(%esi)
	; X86-SSE2-NEXT: movl %esi, %eax
	; X86-SSE2-NEXT: leal -8(%ebp), %esp
	; X86-SSE2-NEXT: popl %esi
	; X86-SSE2-NEXT: popl %edi
	; X86-SSE2-NEXT: popl %ebp
	; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
	; X86-SSE2-NEXT: retl $4
	;
	; X86-SLM-LABEL: or128:
	; X86-SLM: # %bb.0:
	; X86-SLM-NEXT: pushl %ebp
	; X86-SLM-NEXT: .cfi_def_cfa_offset 8
	; X86-SLM-NEXT: .cfi_offset %ebp, -8
	; X86-SLM-NEXT: movl %esp, %ebp
	; X86-SLM-NEXT: .cfi_def_cfa_register %ebp
	; X86-SLM-NEXT: pushl %edi
	; X86-SLM-NEXT: pushl %esi
	; X86-SLM-NEXT: andl $-8, %esp
	; X86-SLM-NEXT: subl $16, %esp
	; X86-SLM-NEXT: .cfi_offset %esi, -16
	; X86-SLM-NEXT: .cfi_offset %edi, -12
	; X86-SLM-NEXT: movl 8(%ebp), %esi
	; X86-SLM-NEXT: movl 12(%ebp), %eax
	; X86-SLM-NEXT: movl %esp, %ecx
	; X86-SLM-NEXT: pushl $0
	; X86-SLM-NEXT: pushl $0
	; X86-SLM-NEXT: pushl $0
	; X86-SLM-NEXT: pushl $0
	; X86-SLM-NEXT: pushl %eax
	; X86-SLM-NEXT: pushl %ecx
	; X86-SLM-NEXT: calll __sync_fetch_and_or_16
	; X86-SLM-NEXT: addl $20, %esp
	; X86-SLM-NEXT: movl (%esp), %eax
	; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %edi
	; X86-SLM-NEXT: movl %edi, 8(%esi)
	; X86-SLM-NEXT: movl %edx, 12(%esi)
	; X86-SLM-NEXT: movl %eax, (%esi)
	; X86-SLM-NEXT: movl %ecx, 4(%esi)
	; X86-SLM-NEXT: movl %esi, %eax
	; X86-SLM-NEXT: leal -8(%ebp), %esp
	; X86-SLM-NEXT: popl %esi
	; X86-SLM-NEXT: popl %edi
	; X86-SLM-NEXT: popl %ebp
	; X86-SLM-NEXT: .cfi_def_cfa %esp, 4
	; X86-SLM-NEXT: retl $4
	;
	; X86-ATOM-LABEL: or128:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: pushl %ebp
	; X86-ATOM-NEXT: .cfi_def_cfa_offset 8
	; X86-ATOM-NEXT: .cfi_offset %ebp, -8
	; X86-ATOM-NEXT: leal (%esp), %ebp
	; X86-ATOM-NEXT: .cfi_def_cfa_register %ebp
	; X86-ATOM-NEXT: pushl %edi
	; X86-ATOM-NEXT: pushl %esi
	; X86-ATOM-NEXT: andl $-8, %esp
	; X86-ATOM-NEXT: leal -{{[0-9]+}}(%esp), %esp
	; X86-ATOM-NEXT: .cfi_offset %esi, -16
	; X86-ATOM-NEXT: .cfi_offset %edi, -12
	; X86-ATOM-NEXT: movl 8(%ebp), %esi
	; X86-ATOM-NEXT: movl 12(%ebp), %eax
	; X86-ATOM-NEXT: movl %esp, %ecx
	; X86-ATOM-NEXT: pushl $0
	; X86-ATOM-NEXT: pushl $0
	; X86-ATOM-NEXT: pushl $0
	; X86-ATOM-NEXT: pushl $0
	; X86-ATOM-NEXT: pushl %eax
	; X86-ATOM-NEXT: pushl %ecx
	; X86-ATOM-NEXT: calll __sync_fetch_and_or_16
	; X86-ATOM-NEXT: leal {{[0-9]+}}(%esp), %esp
	; X86-ATOM-NEXT: movl (%esp), %ecx
	; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx
	; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edi
	; X86-ATOM-NEXT: movl %eax, 8(%esi)
	; X86-ATOM-NEXT: movl %edi, 12(%esi)
	; X86-ATOM-NEXT: movl %ecx, (%esi)
	; X86-ATOM-NEXT: movl %esi, %eax
	; X86-ATOM-NEXT: movl %edx, 4(%esi)
	; X86-ATOM-NEXT: leal -8(%ebp), %esp
	; X86-ATOM-NEXT: popl %esi
	; X86-ATOM-NEXT: popl %edi
	; X86-ATOM-NEXT: popl %ebp
	; X86-ATOM-NEXT: .cfi_def_cfa %esp, 4
	; X86-ATOM-NEXT: retl $4
	; OR with 0 leaves memory unchanged; %1 is the value read from %p.
	%1 = atomicrmw or i128* %p, i128 0 monotonic
	ret i128 %1
	}

	; For 'and', the idempotent value is (-1)
	; and32: `atomicrmw and` of -1 on an i32 with acq_rel ordering; the loaded
	; value is returned. The checks expect mfence + 32-bit load on x86-64 and
	; i686 +sse2, and a `lock cmpxchgl` loop with sse2 disabled.
	; NOTE(review): @and32 is also defined earlier in this file; this copy looks
	; like an extraction duplicate - confirm before relying on it.
	define i32 @and32 (i32* %p) {
	; X64-LABEL: and32:
	; X64: # %bb.0:
	; X64-NEXT: mfence
	; X64-NEXT: movl (%rdi), %eax
	; X64-NEXT: retq
	;
	; X86-SSE2-LABEL: and32:
	; X86-SSE2: # %bb.0:
	; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-SSE2-NEXT: mfence
	; X86-SSE2-NEXT: movl (%eax), %eax
	; X86-SSE2-NEXT: retl
	;
	; X86-SLM-LABEL: and32:
	; X86-SLM: # %bb.0:
	; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-SLM-NEXT: movl (%ecx), %eax
	; X86-SLM-NEXT: .p2align 4, 0x90
	; X86-SLM-NEXT: .LBB5_1: # %atomicrmw.start
	; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
	; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
	; X86-SLM-NEXT: jne .LBB5_1
	; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
	; X86-SLM-NEXT: retl
	;
	; X86-ATOM-LABEL: and32:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X86-ATOM-NEXT: movl (%ecx), %eax
	; X86-ATOM-NEXT: .p2align 4, 0x90
	; X86-ATOM-NEXT: .LBB5_1: # %atomicrmw.start
	; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
	; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
	; X86-ATOM-NEXT: jne .LBB5_1
	; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
	; X86-ATOM-NEXT: retl
	; AND with all-ones leaves memory unchanged; %1 is the value read from %p.
	%1 = atomicrmw and i32* %p, i32 -1 acq_rel
	ret i32 %1
	}

	; or32_nouse_monotonic: idempotent `atomicrmw or` of 0 on an i32 with
	; monotonic ordering whose result is discarded. Every run is expected to emit
	; only the #MEMBARRIER marker (plus six nops on atom) before returning.
	; NOTE(review): this symbol is also defined earlier in this file; this copy
	; looks like an extraction duplicate - confirm before relying on it.
	define void @or32_nouse_monotonic(i32* %p) {
	; X64-LABEL: or32_nouse_monotonic:
	; X64: # %bb.0:
	; X64-NEXT: #MEMBARRIER
	; X64-NEXT: retq
	;
	; X86-GENERIC-LABEL: or32_nouse_monotonic:
	; X86-GENERIC: # %bb.0:
	; X86-GENERIC-NEXT: #MEMBARRIER
	; X86-GENERIC-NEXT: retl
	;
	; X86-ATOM-LABEL: or32_nouse_monotonic:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: #MEMBARRIER
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: retl
	; The result is intentionally unnamed and unused.
	atomicrmw or i32* %p, i32 0 monotonic
	ret void
	}


	; or32_nouse_acquire: idempotent `atomicrmw or` of 0 on an i32 with acquire
	; ordering whose result is discarded. Every run is expected to emit only the
	; #MEMBARRIER marker (plus six nops on atom) before returning.
	; NOTE(review): this symbol is also defined earlier in this file; this copy
	; looks like an extraction duplicate - confirm before relying on it.
	define void @or32_nouse_acquire(i32* %p) {
	; X64-LABEL: or32_nouse_acquire:
	; X64: # %bb.0:
	; X64-NEXT: #MEMBARRIER
	; X64-NEXT: retq
	;
	; X86-GENERIC-LABEL: or32_nouse_acquire:
	; X86-GENERIC: # %bb.0:
	; X86-GENERIC-NEXT: #MEMBARRIER
	; X86-GENERIC-NEXT: retl
	;
	; X86-ATOM-LABEL: or32_nouse_acquire:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: #MEMBARRIER
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: retl
	; The result is intentionally unnamed and unused.
	atomicrmw or i32* %p, i32 0 acquire
	ret void
	}

	; Idempotent 'or 0' on an i32 whose result is unused, with release ordering.
	; As with the other non-seq_cst orderings, every run expects only the
	; '#MEMBARRIER' line before the return (plus six nops on the atom run).
	define void @or32_nouse_release(i32* %p) {
	; X64-LABEL: or32_nouse_release:
	; X64: # %bb.0:
	; X64-NEXT: #MEMBARRIER
	; X64-NEXT: retq
	;
	; X86-GENERIC-LABEL: or32_nouse_release:
	; X86-GENERIC: # %bb.0:
	; X86-GENERIC-NEXT: #MEMBARRIER
	; X86-GENERIC-NEXT: retl
	;
	; X86-ATOM-LABEL: or32_nouse_release:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: #MEMBARRIER
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: retl
	atomicrmw or i32* %p, i32 0 release
	ret void
	}

	; Idempotent 'or 0' on an i32 whose result is unused, with acq_rel ordering.
	; Even for this ordering every run expects only the '#MEMBARRIER' line
	; before the return (plus six nops on the atom run); contrast with the
	; seq_cst variants, which expect a locked instruction.
	define void @or32_nouse_acq_rel(i32* %p) {
	; X64-LABEL: or32_nouse_acq_rel:
	; X64: # %bb.0:
	; X64-NEXT: #MEMBARRIER
	; X64-NEXT: retq
	;
	; X86-GENERIC-LABEL: or32_nouse_acq_rel:
	; X86-GENERIC: # %bb.0:
	; X86-GENERIC-NEXT: #MEMBARRIER
	; X86-GENERIC-NEXT: retl
	;
	; X86-ATOM-LABEL: or32_nouse_acq_rel:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: #MEMBARRIER
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: retl
	atomicrmw or i32* %p, i32 0 acq_rel
	ret void
	}

	; Idempotent 'or 0' on an i32 whose result is unused, with seq_cst ordering.
	; Every run expects a 'lock orl $0' applied to a stack location (a negative
	; offset from %rsp on x86-64, (%esp) on i686) rather than an access through
	; %p. The atom run additionally expects six nops before 'retl'.
	define void @or32_nouse_seq_cst(i32* %p) {
	; X64-LABEL: or32_nouse_seq_cst:
	; X64: # %bb.0:
	; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
	; X64-NEXT: retq
	;
	; X86-GENERIC-LABEL: or32_nouse_seq_cst:
	; X86-GENERIC: # %bb.0:
	; X86-GENERIC-NEXT: lock orl $0, (%esp)
	; X86-GENERIC-NEXT: retl
	;
	; X86-ATOM-LABEL: or32_nouse_seq_cst:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: lock orl $0, (%esp)
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: retl
	atomicrmw or i32* %p, i32 0 seq_cst
	ret void
	}

	; Idempotent seq_cst 'or 0' on an i64 whose result is unused.
	;   - x86-64: expects the same 'lock orl $0' on a stack location as the i32
	;     variant.
	;   - i686 (all 32-bit runs share one check prefix here): expects a
	;     'lock cmpxchg8b' retry loop on %p, saving and restoring %ebx and %esi.
	; TODO: The value isn't used on 32 bit, so the cmpxchg8b is unneeded
	define void @or64_nouse_seq_cst(i64* %p) {
	; X64-LABEL: or64_nouse_seq_cst:
	; X64: # %bb.0:
	; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
	; X64-NEXT: retq
	;
	; X86-LABEL: or64_nouse_seq_cst:
	; X86: # %bb.0:
	; X86-NEXT: pushl %ebx
	; X86-NEXT: .cfi_def_cfa_offset 8
	; X86-NEXT: pushl %esi
	; X86-NEXT: .cfi_def_cfa_offset 12
	; X86-NEXT: .cfi_offset %esi, -12
	; X86-NEXT: .cfi_offset %ebx, -8
	; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
	; X86-NEXT: movl (%esi), %eax
	; X86-NEXT: movl 4(%esi), %edx
	; X86-NEXT: .p2align 4, 0x90
	; X86-NEXT: .LBB11_1: # %atomicrmw.start
	; X86-NEXT: # =>This Inner Loop Header: Depth=1
	; X86-NEXT: movl %edx, %ecx
	; X86-NEXT: movl %eax, %ebx
	; X86-NEXT: lock cmpxchg8b (%esi)
	; X86-NEXT: jne .LBB11_1
	; X86-NEXT: # %bb.2: # %atomicrmw.end
	; X86-NEXT: popl %esi
	; X86-NEXT: .cfi_def_cfa_offset 8
	; X86-NEXT: popl %ebx
	; X86-NEXT: .cfi_def_cfa_offset 4
	; X86-NEXT: retl
	atomicrmw or i64* %p, i64 0 seq_cst
	ret void
	}

	; Idempotent seq_cst 'or 0' on an i128 whose result is unused. Every run
	; currently expects a call to the __sync_fetch_and_or_16 library routine:
	;   - x86-64: zeroes %esi and %edx, then calls.
	;   - i686: sets up an %ebp frame, then pushes four zero dwords, the pointer
	;     argument, and the address of a stack area (presumably the buffer for
	;     the returned i128 -- confirm against the libcall's convention).
	;   - atom: same sequence, but with 'leal' used for the stack-pointer
	;     adjustments.
	; TODO: This does not need to be lowered as a __sync_fetch_and_or_16 libcall.
	define void @or128_nouse_seq_cst(i128* %p) {
	; X64-LABEL: or128_nouse_seq_cst:
	; X64: # %bb.0:
	; X64-NEXT: pushq %rax
	; X64-NEXT: .cfi_def_cfa_offset 16
	; X64-NEXT: xorl %esi, %esi
	; X64-NEXT: xorl %edx, %edx
	; X64-NEXT: callq __sync_fetch_and_or_16
	; X64-NEXT: popq %rax
	; X64-NEXT: .cfi_def_cfa_offset 8
	; X64-NEXT: retq
	;
	; X86-SSE2-LABEL: or128_nouse_seq_cst:
	; X86-SSE2: # %bb.0:
	; X86-SSE2-NEXT: pushl %ebp
	; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
	; X86-SSE2-NEXT: .cfi_offset %ebp, -8
	; X86-SSE2-NEXT: movl %esp, %ebp
	; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
	; X86-SSE2-NEXT: andl $-8, %esp
	; X86-SSE2-NEXT: subl $16, %esp
	; X86-SSE2-NEXT: movl %esp, %eax
	; X86-SSE2-NEXT: pushl $0
	; X86-SSE2-NEXT: pushl $0
	; X86-SSE2-NEXT: pushl $0
	; X86-SSE2-NEXT: pushl $0
	; X86-SSE2-NEXT: pushl 8(%ebp)
	; X86-SSE2-NEXT: pushl %eax
	; X86-SSE2-NEXT: calll __sync_fetch_and_or_16
	; X86-SSE2-NEXT: addl $20, %esp
	; X86-SSE2-NEXT: movl %ebp, %esp
	; X86-SSE2-NEXT: popl %ebp
	; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
	; X86-SSE2-NEXT: retl
	;
	; X86-SLM-LABEL: or128_nouse_seq_cst:
	; X86-SLM: # %bb.0:
	; X86-SLM-NEXT: pushl %ebp
	; X86-SLM-NEXT: .cfi_def_cfa_offset 8
	; X86-SLM-NEXT: .cfi_offset %ebp, -8
	; X86-SLM-NEXT: movl %esp, %ebp
	; X86-SLM-NEXT: .cfi_def_cfa_register %ebp
	; X86-SLM-NEXT: andl $-8, %esp
	; X86-SLM-NEXT: subl $16, %esp
	; X86-SLM-NEXT: movl 8(%ebp), %eax
	; X86-SLM-NEXT: movl %esp, %ecx
	; X86-SLM-NEXT: pushl $0
	; X86-SLM-NEXT: pushl $0
	; X86-SLM-NEXT: pushl $0
	; X86-SLM-NEXT: pushl $0
	; X86-SLM-NEXT: pushl %eax
	; X86-SLM-NEXT: pushl %ecx
	; X86-SLM-NEXT: calll __sync_fetch_and_or_16
	; X86-SLM-NEXT: addl $20, %esp
	; X86-SLM-NEXT: movl %ebp, %esp
	; X86-SLM-NEXT: popl %ebp
	; X86-SLM-NEXT: .cfi_def_cfa %esp, 4
	; X86-SLM-NEXT: retl
	;
	; X86-ATOM-LABEL: or128_nouse_seq_cst:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: pushl %ebp
	; X86-ATOM-NEXT: .cfi_def_cfa_offset 8
	; X86-ATOM-NEXT: .cfi_offset %ebp, -8
	; X86-ATOM-NEXT: leal (%esp), %ebp
	; X86-ATOM-NEXT: .cfi_def_cfa_register %ebp
	; X86-ATOM-NEXT: andl $-8, %esp
	; X86-ATOM-NEXT: leal -{{[0-9]+}}(%esp), %esp
	; X86-ATOM-NEXT: movl 8(%ebp), %eax
	; X86-ATOM-NEXT: movl %esp, %ecx
	; X86-ATOM-NEXT: pushl $0
	; X86-ATOM-NEXT: pushl $0
	; X86-ATOM-NEXT: pushl $0
	; X86-ATOM-NEXT: pushl $0
	; X86-ATOM-NEXT: pushl %eax
	; X86-ATOM-NEXT: pushl %ecx
	; X86-ATOM-NEXT: calll __sync_fetch_and_or_16
	; X86-ATOM-NEXT: leal {{[0-9]+}}(%esp), %esp
	; X86-ATOM-NEXT: movl %ebp, %esp
	; X86-ATOM-NEXT: popl %ebp
	; X86-ATOM-NEXT: .cfi_def_cfa %esp, 4
	; X86-ATOM-NEXT: retl
	atomicrmw or i128* %p, i128 0 seq_cst
	ret void
	}


	; Idempotent seq_cst 'or 0' on an i16 whose result is unused. The expected
	; output matches the i32 variant: a 32-bit 'lock orl $0' on a stack location
	; rather than a 16-bit operation through %p (plus six nops on the atom run).
	define void @or16_nouse_seq_cst(i16* %p) {
	; X64-LABEL: or16_nouse_seq_cst:
	; X64: # %bb.0:
	; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
	; X64-NEXT: retq
	;
	; X86-GENERIC-LABEL: or16_nouse_seq_cst:
	; X86-GENERIC: # %bb.0:
	; X86-GENERIC-NEXT: lock orl $0, (%esp)
	; X86-GENERIC-NEXT: retl
	;
	; X86-ATOM-LABEL: or16_nouse_seq_cst:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: lock orl $0, (%esp)
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: retl
	atomicrmw or i16* %p, i16 0 seq_cst
	ret void
	}

	; Idempotent seq_cst 'or 0' on an i8 whose result is unused. The expected
	; output matches the i32 variant: a 32-bit 'lock orl $0' on a stack location
	; rather than an 8-bit operation through %p (plus six nops on the atom run).
	define void @or8_nouse_seq_cst(i8* %p) {
	; X64-LABEL: or8_nouse_seq_cst:
	; X64: # %bb.0:
	; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
	; X64-NEXT: retq
	;
	; X86-GENERIC-LABEL: or8_nouse_seq_cst:
	; X86-GENERIC: # %bb.0:
	; X86-GENERIC-NEXT: lock orl $0, (%esp)
	; X86-GENERIC-NEXT: retl
	;
	; X86-ATOM-LABEL: or8_nouse_seq_cst:
	; X86-ATOM: # %bb.0:
	; X86-ATOM-NEXT: lock orl $0, (%esp)
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: nop
	; X86-ATOM-NEXT: retl
	atomicrmw or i8* %p, i8 0 seq_cst
	ret void
	}