; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE,SSE4
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
; bt/btc/btr/bts patterns + 'init' to set a single bit value in large integers
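;
; For i128 and wider, the expected lowering narrows each bit operation to the
; single 32-bit word containing the addressed bit: the word offset is derived
; from the position (a shrl $3 combined with an alignment mask) and the bit is
; then tested/updated with bt/btc/btr/bts on just that word, rather than
; materializing the whole big-integer value. The checks below encode that
; expectation.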
;
; i32 bt/btc/btr/bts + init (reference)
;
define i1 @test_eq_i32(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_eq_i32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl (%eax), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: btl %ecx, %eax
; X86-NEXT: setae %al
; X86-NEXT: retl
;
; X64-LABEL: test_eq_i32:
; X64: # %bb.0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: btl %esi, %eax
; X64-NEXT: setae %al
; X64-NEXT: retq
%rem = and i32 %position, 31
%bit = shl nuw i32 1, %rem
%ld = load i32, ptr %word
%test = and i32 %ld, %bit
%cmp = icmp eq i32 %test, 0
ret i1 %cmp
}
define i1 @complement_ne_i32(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i32:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: btcl %eax, %esi
; X86-NEXT: btl %eax, %edx
; X86-NEXT: setb %al
; X86-NEXT: movl %esi, (%ecx)
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i32:
; X64: # %bb.0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: btcl %esi, %ecx
; X64-NEXT: btl %esi, %eax
; X64-NEXT: setb %al
; X64-NEXT: movl %ecx, (%rdi)
; X64-NEXT: retq
%ofs = and i32 %position, 31
%bit = shl nuw i32 1, %ofs
%ld = load i32, ptr %word
%test = and i32 %ld, %bit
%res = xor i32 %ld, %bit
%cmp = icmp ne i32 %test, 0
store i32 %res, ptr %word
ret i1 %cmp
}
define i1 @reset_eq_i32(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i32:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: btrl %eax, %esi
; X86-NEXT: btl %eax, %edx
; X86-NEXT: setae %al
; X86-NEXT: movl %esi, (%ecx)
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i32:
; X64: # %bb.0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: btrl %esi, %ecx
; X64-NEXT: btl %esi, %eax
; X64-NEXT: setae %al
; X64-NEXT: movl %ecx, (%rdi)
; X64-NEXT: retq
%ofs = and i32 %position, 31
%bit = shl nuw i32 1, %ofs
%mask = xor i32 %bit, -1
%ld = load i32, ptr %word
%test = and i32 %ld, %bit
%res = and i32 %ld, %mask
%cmp = icmp eq i32 %test, 0
store i32 %res, ptr %word
ret i1 %cmp
}
define i1 @set_ne_i32(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i32:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: btsl %eax, %esi
; X86-NEXT: btl %eax, %edx
; X86-NEXT: setb %al
; X86-NEXT: movl %esi, (%ecx)
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i32:
; X64: # %bb.0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: btsl %esi, %ecx
; X64-NEXT: btl %esi, %eax
; X64-NEXT: setb %al
; X64-NEXT: movl %ecx, (%rdi)
; X64-NEXT: retq
%ofs = and i32 %position, 31
%bit = shl nuw i32 1, %ofs
%ld = load i32, ptr %word
%test = and i32 %ld, %bit
%res = or i32 %ld, %bit
%cmp = icmp ne i32 %test, 0
store i32 %res, ptr %word
ret i1 %cmp
}
define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i32:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll %cl, %eax
; X86-NEXT: movl (%edx), %esi
; X86-NEXT: movl %esi, %edi
; X86-NEXT: btrl %ecx, %edi
; X86-NEXT: orl %eax, %edi
; X86-NEXT: btl %ecx, %esi
; X86-NEXT: setae %al
; X86-NEXT: movl %edi, (%edx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i32:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: shll %cl, %edx
; SSE-NEXT: movl (%rdi), %eax
; SSE-NEXT: movl %eax, %esi
; SSE-NEXT: btrl %ecx, %esi
; SSE-NEXT: orl %edx, %esi
; SSE-NEXT: btl %ecx, %eax
; SSE-NEXT: setae %al
; SSE-NEXT: movl %esi, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: init_eq_i32:
; AVX: # %bb.0:
; AVX-NEXT: shlxl %esi, %edx, %eax
; AVX-NEXT: movl (%rdi), %ecx
; AVX-NEXT: movl %ecx, %edx
; AVX-NEXT: btrl %esi, %edx
; AVX-NEXT: orl %eax, %edx
; AVX-NEXT: btl %esi, %ecx
; AVX-NEXT: setae %al
; AVX-NEXT: movl %edx, (%rdi)
; AVX-NEXT: retq
%ofs = and i32 %position, 31
%bit = shl nuw i32 1, %ofs
%mask = xor i32 %bit, -1
%val0 = zext i1 %value to i32
%val = shl nuw i32 %val0, %ofs
%ld = load i32, ptr %word
%test = and i32 %ld, %bit
%res0 = and i32 %ld, %mask
%res = or i32 %res0, %val
%cmp = icmp eq i32 %test, 0
store i32 %res, ptr %word
ret i1 %cmp
}
;
; i64 bt/btc/btr/bts + init
;
define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $32, %edx
; X86-NEXT: shrl $3, %edx
; X86-NEXT: movl (%eax,%edx), %eax
; X86-NEXT: btl %ecx, %eax
; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: test_ne_i64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: btq %rsi, %rax
; X64-NEXT: setb %al
; X64-NEXT: retq
%rem = and i32 %position, 63
%ofs = zext nneg i32 %rem to i64
%bit = shl nuw i64 1, %ofs
%ld = load i64, ptr %word
%test = and i64 %ld, %bit
%cmp = icmp ne i64 %test, 0
ret i1 %cmp
}
define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: andl $32, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: movl (%ecx,%esi), %edi
; X86-NEXT: btl %edx, %edi
; X86-NEXT: setb %al
; X86-NEXT: btcl %edx, %edi
; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: btcq %rsi, %rcx
; X64-NEXT: btq %rsi, %rax
; X64-NEXT: setb %al
; X64-NEXT: movq %rcx, (%rdi)
; X64-NEXT: retq
%rem = and i32 %position, 63
%ofs = zext nneg i32 %rem to i64
%bit = shl nuw i64 1, %ofs
%ld = load i64, ptr %word
%test = and i64 %ld, %bit
%res = xor i64 %ld, %bit
%cmp = icmp ne i64 %test, 0
store i64 %res, ptr %word
ret i1 %cmp
}
define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: andl $32, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: movl (%ecx,%esi), %edi
; X86-NEXT: btl %edx, %edi
; X86-NEXT: setae %al
; X86-NEXT: btrl %edx, %edi
; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: btrq %rsi, %rcx
; X64-NEXT: btq %rsi, %rax
; X64-NEXT: setae %al
; X64-NEXT: movq %rcx, (%rdi)
; X64-NEXT: retq
%rem = and i32 %position, 63
%ofs = zext nneg i32 %rem to i64
%bit = shl nuw i64 1, %ofs
%mask = xor i64 %bit, -1
%ld = load i64, ptr %word
%test = and i64 %ld, %bit
%res = and i64 %ld, %mask
%cmp = icmp eq i64 %test, 0
store i64 %res, ptr %word
ret i1 %cmp
}
define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: andl $32, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: movl (%ecx,%esi), %edi
; X86-NEXT: btl %edx, %edi
; X86-NEXT: setb %al
; X86-NEXT: btsl %edx, %edi
; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: btsq %rsi, %rcx
; X64-NEXT: btq %rsi, %rax
; X64-NEXT: setb %al
; X64-NEXT: movq %rcx, (%rdi)
; X64-NEXT: retq
%rem = and i32 %position, 63
%ofs = zext nneg i32 %rem to i64
%bit = shl nuw i64 1, %ofs
%ld = load i64, ptr %word
%test = and i64 %ld, %bit
%res = or i64 %ld, %bit
%cmp = icmp ne i64 %test, 0
store i64 %res, ptr %word
ret i1 %cmp
}
define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: andl $32, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: movl (%edx,%esi), %edi
; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
; X86-NEXT: btrl %ecx, %edi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shll %cl, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i64:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: movl %edx, %eax
; SSE-NEXT: shlq %cl, %rax
; SSE-NEXT: movq (%rdi), %rdx
; SSE-NEXT: movq %rdx, %rsi
; SSE-NEXT: btrq %rcx, %rsi
; SSE-NEXT: orq %rax, %rsi
; SSE-NEXT: btq %rcx, %rdx
; SSE-NEXT: setae %al
; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: init_eq_i64:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $esi killed $esi def $rsi
; AVX-NEXT: movl %edx, %eax
; AVX-NEXT: shlxq %rsi, %rax, %rax
; AVX-NEXT: movq (%rdi), %rcx
; AVX-NEXT: movq %rcx, %rdx
; AVX-NEXT: btrq %rsi, %rdx
; AVX-NEXT: orq %rax, %rdx
; AVX-NEXT: btq %rsi, %rcx
; AVX-NEXT: setae %al
; AVX-NEXT: movq %rdx, (%rdi)
; AVX-NEXT: retq
%rem = and i32 %position, 63
%ofs = zext nneg i32 %rem to i64
%bit = shl nuw i64 1, %ofs
%mask = xor i64 %bit, -1
%val0 = zext i1 %value to i64
%val = shl nuw i64 %val0, %ofs
%ld = load i64, ptr %word
%test = and i64 %ld, %bit
%res0 = and i64 %ld, %mask
%res = or i64 %res0, %val
%cmp = icmp eq i64 %test, 0
store i64 %res, ptr %word
ret i1 %cmp
}
;
; i128
;
define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $96, %edx
; X86-NEXT: shrl $3, %edx
; X86-NEXT: movl (%eax,%edx), %eax
; X86-NEXT: btl %ecx, %eax
; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: test_ne_i128:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %eax
; X64-NEXT: andl $96, %eax
; X64-NEXT: shrl $3, %eax
; X64-NEXT: movl (%rdi,%rax), %eax
; X64-NEXT: btl %esi, %eax
; X64-NEXT: setb %al
; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
%ld = load i128, ptr %word
%test = and i128 %ld, %bit
%cmp = icmp ne i128 %test, 0
ret i1 %cmp
}
define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i128:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: andl $96, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: movl (%ecx,%esi), %edi
; X86-NEXT: btl %edx, %edi
; X86-NEXT: setb %al
; X86-NEXT: btcl %edx, %edi
; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i128:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: andl $96, %ecx
; X64-NEXT: shrl $3, %ecx
; X64-NEXT: movl (%rdi,%rcx), %edx
; X64-NEXT: btl %esi, %edx
; X64-NEXT: setb %al
; X64-NEXT: btcl %esi, %edx
; X64-NEXT: movl %edx, (%rdi,%rcx)
; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
%ld = load i128, ptr %word
%test = and i128 %ld, %bit
%res = xor i128 %ld, %bit
%cmp = icmp ne i128 %test, 0
store i128 %res, ptr %word
ret i1 %cmp
}
define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i128:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: andl $96, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: movl (%ecx,%esi), %edi
; X86-NEXT: btl %edx, %edi
; X86-NEXT: setae %al
; X86-NEXT: btrl %edx, %edi
; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i128:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: andl $96, %ecx
; X64-NEXT: shrl $3, %ecx
; X64-NEXT: movl (%rdi,%rcx), %edx
; X64-NEXT: btl %esi, %edx
; X64-NEXT: setae %al
; X64-NEXT: btrl %esi, %edx
; X64-NEXT: movl %edx, (%rdi,%rcx)
; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
%mask = xor i128 %bit, -1
%ld = load i128, ptr %word
%test = and i128 %ld, %bit
%res = and i128 %ld, %mask
%cmp = icmp eq i128 %test, 0
store i128 %res, ptr %word
ret i1 %cmp
}
define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i128:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: andl $96, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: movl (%ecx,%esi), %edi
; X86-NEXT: btl %edx, %edi
; X86-NEXT: setb %al
; X86-NEXT: btsl %edx, %edi
; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i128:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: andl $96, %ecx
; X64-NEXT: shrl $3, %ecx
; X64-NEXT: movl (%rdi,%rcx), %edx
; X64-NEXT: btl %esi, %edx
; X64-NEXT: setb %al
; X64-NEXT: btsl %esi, %edx
; X64-NEXT: movl %edx, (%rdi,%rcx)
; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
%ld = load i128, ptr %word
%test = and i128 %ld, %bit
%res = or i128 %ld, %bit
%cmp = icmp ne i128 %test, 0
store i128 %res, ptr %word
ret i1 %cmp
}
define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i128:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: andl $96, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: movl (%edx,%esi), %edi
; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
; X86-NEXT: btrl %ecx, %edi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shll %cl, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: andl $96, %esi
; SSE-NEXT: shrl $3, %esi
; SSE-NEXT: movl (%rdi,%rsi), %r8d
; SSE-NEXT: btl %ecx, %r8d
; SSE-NEXT: setae %al
; SSE-NEXT: shll %cl, %edx
; SSE-NEXT: btrl %ecx, %r8d
; SSE-NEXT: orl %r8d, %edx
; SSE-NEXT: movl %edx, (%rdi,%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: init_eq_i128:
; AVX: # %bb.0:
; AVX-NEXT: movl %esi, %ecx
; AVX-NEXT: andl $96, %ecx
; AVX-NEXT: shrl $3, %ecx
; AVX-NEXT: movl (%rdi,%rcx), %r8d
; AVX-NEXT: btl %esi, %r8d
; AVX-NEXT: setae %al
; AVX-NEXT: btrl %esi, %r8d
; AVX-NEXT: shlxl %esi, %edx, %edx
; AVX-NEXT: orl %r8d, %edx
; AVX-NEXT: movl %edx, (%rdi,%rcx)
; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
%mask = xor i128 %bit, -1
%val0 = zext i1 %value to i128
%val = shl nuw i128 %val0, %ofs
%ld = load i128, ptr %word
%test = and i128 %ld, %bit
%res0 = and i128 %ld, %mask
%res = or i128 %res0, %val
%cmp = icmp eq i128 %test, 0
store i128 %res, ptr %word
ret i1 %cmp
}
;
; i512
;
define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i512:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: shrl $3, %edx
; X86-NEXT: andl $60, %edx
; X86-NEXT: movl (%eax,%edx), %eax
; X86-NEXT: btl %ecx, %eax
; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: test_ne_i512:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %eax
; X64-NEXT: shrl $3, %eax
; X64-NEXT: andl $60, %eax
; X64-NEXT: movl (%rdi,%rax), %eax
; X64-NEXT: btl %esi, %eax
; X64-NEXT: setb %al
; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
%ld = load i512, ptr %word
%test = and i512 %ld, %bit
%cmp = icmp ne i512 %test, 0
ret i1 %cmp
}
define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i512:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: andl $60, %esi
; X86-NEXT: movl (%ecx,%esi), %edi
; X86-NEXT: btl %edx, %edi
; X86-NEXT: setb %al
; X86-NEXT: btcl %edx, %edi
; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i512:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: shrl $3, %ecx
; X64-NEXT: andl $60, %ecx
; X64-NEXT: movl (%rdi,%rcx), %edx
; X64-NEXT: btl %esi, %edx
; X64-NEXT: setb %al
; X64-NEXT: btcl %esi, %edx
; X64-NEXT: movl %edx, (%rdi,%rcx)
; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
%ld = load i512, ptr %word
%test = and i512 %ld, %bit
%res = xor i512 %ld, %bit
%cmp = icmp ne i512 %test, 0
store i512 %res, ptr %word
ret i1 %cmp
}
define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i512:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: andl $60, %esi
; X86-NEXT: movl (%ecx,%esi), %edi
; X86-NEXT: btl %edx, %edi
; X86-NEXT: setae %al
; X86-NEXT: btrl %edx, %edi
; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i512:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: shrl $3, %ecx
; X64-NEXT: andl $60, %ecx
; X64-NEXT: movl (%rdi,%rcx), %edx
; X64-NEXT: btl %esi, %edx
; X64-NEXT: setae %al
; X64-NEXT: btrl %esi, %edx
; X64-NEXT: movl %edx, (%rdi,%rcx)
; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
%mask = xor i512 %bit, -1
%ld = load i512, ptr %word
%test = and i512 %ld, %bit
%res = and i512 %ld, %mask
%cmp = icmp eq i512 %test, 0
store i512 %res, ptr %word
ret i1 %cmp
}
define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i512:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: andl $60, %esi
; X86-NEXT: movl (%ecx,%esi), %edi
; X86-NEXT: btl %edx, %edi
; X86-NEXT: setb %al
; X86-NEXT: btsl %edx, %edi
; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i512:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: shrl $3, %ecx
; X64-NEXT: andl $60, %ecx
; X64-NEXT: movl (%rdi,%rcx), %edx
; X64-NEXT: btl %esi, %edx
; X64-NEXT: setb %al
; X64-NEXT: btsl %esi, %edx
; X64-NEXT: movl %edx, (%rdi,%rcx)
; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
%ld = load i512, ptr %word
%test = and i512 %ld, %bit
%res = or i512 %ld, %bit
%cmp = icmp ne i512 %test, 0
store i512 %res, ptr %word
ret i1 %cmp
}
define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i512:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: andl $60, %esi
; X86-NEXT: movl (%edx,%esi), %edi
; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
; X86-NEXT: btrl %ecx, %edi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shll %cl, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i512:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: shrl $3, %esi
; SSE-NEXT: andl $60, %esi
; SSE-NEXT: movl (%rdi,%rsi), %r8d
; SSE-NEXT: btl %ecx, %r8d
; SSE-NEXT: setae %al
; SSE-NEXT: shll %cl, %edx
; SSE-NEXT: btrl %ecx, %r8d
; SSE-NEXT: orl %r8d, %edx
; SSE-NEXT: movl %edx, (%rdi,%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: init_eq_i512:
; AVX: # %bb.0:
; AVX-NEXT: movl %esi, %ecx
; AVX-NEXT: shrl $3, %ecx
; AVX-NEXT: andl $60, %ecx
; AVX-NEXT: movl (%rdi,%rcx), %r8d
; AVX-NEXT: btl %esi, %r8d
; AVX-NEXT: setae %al
; AVX-NEXT: btrl %esi, %r8d
; AVX-NEXT: shlxl %esi, %edx, %edx
; AVX-NEXT: orl %r8d, %edx
; AVX-NEXT: movl %edx, (%rdi,%rcx)
; AVX-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
%mask = xor i512 %bit, -1
%val0 = zext i1 %value to i512
%val = shl nuw i512 %val0, %ofs
%ld = load i512, ptr %word
%test = and i512 %ld, %bit
%res0 = and i512 %ld, %mask
%res = or i512 %res0, %val
%cmp = icmp eq i512 %test, 0
store i512 %res, ptr %word
ret i1 %cmp
}
;
; i4096
;
define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i4096:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: andl $4064, %edx # imm = 0xFE0
; X86-NEXT: shrl $3, %edx
; X86-NEXT: movl (%eax,%edx), %eax
; X86-NEXT: btl %ecx, %eax
; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: test_ne_i4096:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %eax
; X64-NEXT: andl $4064, %eax # imm = 0xFE0
; X64-NEXT: shrl $3, %eax
; X64-NEXT: movl (%rdi,%rax), %eax
; X64-NEXT: btl %esi, %eax
; X64-NEXT: setb %al
; X64-NEXT: retq
%rem = and i32 %position, 4095
%ofs = zext nneg i32 %rem to i4096
%bit = shl nuw i4096 1, %ofs
%ld = load i4096, ptr %word
%test = and i4096 %ld, %bit
%cmp = icmp ne i4096 %test, 0
ret i1 %cmp
}
; Special Cases
; Multiple uses of the stored value
define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_cmpz_i128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $1, %edx
; X86-NEXT: shll %cl, %edx
; X86-NEXT: andl $96, %ecx
; X86-NEXT: shrl $3, %ecx
; X86-NEXT: xorl %edx, (%eax,%ecx)
; X86-NEXT: movl (%eax), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: orl 12(%eax), %edx
; X86-NEXT: orl 8(%eax), %ecx
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; SSE-LABEL: complement_cmpz_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: shll %cl, %eax
; SSE-NEXT: andl $96, %ecx
; SSE-NEXT: shrl $3, %ecx
; SSE-NEXT: xorl %eax, (%rdi,%rcx)
; SSE-NEXT: movq (%rdi), %rax
; SSE-NEXT: orq 8(%rdi), %rax
; SSE-NEXT: setne %al
; SSE-NEXT: retq
;
; AVX-LABEL: complement_cmpz_i128:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $esi killed $esi def $rsi
; AVX-NEXT: movl $1, %eax
; AVX-NEXT: shlxl %esi, %eax, %eax
; AVX-NEXT: andl $96, %esi
; AVX-NEXT: shrl $3, %esi
; AVX-NEXT: xorl %eax, (%rdi,%rsi)
; AVX-NEXT: movq (%rdi), %rax
; AVX-NEXT: orq 8(%rdi), %rax
; AVX-NEXT: setne %al
; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
%ld = load i128, ptr %word
%res = xor i128 %ld, %bit
store i128 %res, ptr %word
%cmp = icmp ne i128 %res, 0
ret i1 %cmp
}
; Load hidden behind bitcast
define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i128_bitcast:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $80, %esp
; X86-NEXT: movl 12(%ebp), %eax
; X86-NEXT: movzwl (%eax), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzwl 12(%eax), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzwl 14(%eax), %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shll $16, %edi
; X86-NEXT: orl %ecx, %edi
; X86-NEXT: movzwl 2(%eax), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzwl 4(%eax), %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzwl 6(%eax), %esi
; X86-NEXT: movzwl 8(%eax), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzwl 10(%eax), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NEXT: shll $16, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: orl %edx, %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shll $16, %eax
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl 16(%ebp), %eax
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: andb $96, %bl
; X86-NEXT: shrb $3, %bl
; X86-NEXT: movzbl %bl, %edi
; X86-NEXT: movl 32(%esp,%edi), %edi
; X86-NEXT: btcl %eax, %edi
; X86-NEXT: andl $96, %eax
; X86-NEXT: shrl $3, %eax
; X86-NEXT: movl 12(%ebp), %ecx
; X86-NEXT: movl %edi, (%ecx,%eax)
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movw %dx, 14(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movw %dx, 12(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movw %cx, 10(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movw %cx, 8(%eax)
; X86-NEXT: movw %si, 6(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movw %cx, 4(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movw %cx, 2(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: retl $4
;
; SSE2-LABEL: complement_ne_i128_bitcast:
; SSE2: # %bb.0:
; SSE2-NEXT: # kill: def $esi killed $esi def $rsi
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movq 8(%rdi), %rax
; SSE2-NEXT: movq %xmm0, %rdx
; SSE2-NEXT: movl %esi, %ecx
; SSE2-NEXT: andb $32, %cl
; SSE2-NEXT: shrdq %cl, %rax, %rdx
; SSE2-NEXT: shrq %cl, %rax
; SSE2-NEXT: testb $64, %sil
; SSE2-NEXT: cmoveq %rdx, %rax
; SSE2-NEXT: btcl %esi, %eax
; SSE2-NEXT: andl $96, %esi
; SSE2-NEXT: shrl $3, %esi
; SSE2-NEXT: movl %eax, (%rdi,%rsi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: complement_ne_i128_bitcast:
; SSE4: # %bb.0:
; SSE4-NEXT: # kill: def $esi killed $esi def $rsi
; SSE4-NEXT: movdqa (%rdi), %xmm0
; SSE4-NEXT: pextrq $1, %xmm0, %rax
; SSE4-NEXT: movq %xmm0, %rdx
; SSE4-NEXT: movl %esi, %ecx
; SSE4-NEXT: andb $32, %cl
; SSE4-NEXT: shrdq %cl, %rax, %rdx
; SSE4-NEXT: shrq %cl, %rax
; SSE4-NEXT: testb $64, %sil
; SSE4-NEXT: cmoveq %rdx, %rax
; SSE4-NEXT: btcl %esi, %eax
; SSE4-NEXT: andl $96, %esi
; SSE4-NEXT: shrl $3, %esi
; SSE4-NEXT: movl %eax, (%rdi,%rsi)
; SSE4-NEXT: retq
;
; AVX-LABEL: complement_ne_i128_bitcast:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $esi killed $esi def $rsi
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: vmovq %xmm0, %rdx
; AVX-NEXT: movl %esi, %ecx
; AVX-NEXT: andb $32, %cl
; AVX-NEXT: shrdq %cl, %rax, %rdx
; AVX-NEXT: shrxq %rcx, %rax, %rax
; AVX-NEXT: testb $64, %sil
; AVX-NEXT: cmoveq %rdx, %rax
; AVX-NEXT: btcl %esi, %eax
; AVX-NEXT: andl $96, %esi
; AVX-NEXT: shrl $3, %esi
; AVX-NEXT: movl %eax, (%rdi,%rsi)
; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
%ldv = load <8 x i16>, ptr %word
%ld = bitcast <8 x i16> %ldv to i128
%test = and i128 %ld, %bit
%res = xor i128 %ld, %bit
store i128 %res, ptr %word
ret <8 x i16> %ldv
}
; Multiple loads in store chain
define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-LABEL: reset_multiload_i128:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%eax), %eax
; X86-NEXT: movl %edx, %esi
; X86-NEXT: andl $96, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: movl (%ecx,%esi), %edi
; X86-NEXT: movl %edi, %ebx
; X86-NEXT: btrl %edx, %ebx
; X86-NEXT: btl %edx, %edi
; X86-NEXT: movl %ebx, (%ecx,%esi)
; X86-NEXT: jae .LBB23_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB23_2:
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: reset_multiload_i128:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: andl $96, %ecx
; X64-NEXT: shrl $3, %ecx
; X64-NEXT: movl (%rdi,%rcx), %r9d
; X64-NEXT: movl %r9d, %r8d
; X64-NEXT: btrl %esi, %r8d
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: btl %esi, %r9d
; X64-NEXT: jb .LBB23_2
; X64-NEXT: # %bb.1:
; X64-NEXT: movl (%rdx), %eax
; X64-NEXT: .LBB23_2:
; X64-NEXT: movl %r8d, (%rdi,%rcx)
; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
%mask = xor i128 %bit, -1
%ld = load i128, ptr %word
%sel = load i32, ptr %p
%test = and i128 %ld, %bit
%res = and i128 %ld, %mask
%cmp = icmp eq i128 %test, 0
store i128 %res, ptr %word
%ret = select i1 %cmp, i32 %sel, i32 0
ret i32 %ret
}
; Multiple uses of the store chain AND stored value
define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind {
; X86-LABEL: chain_reset_i256:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $-2, %edi
; X86-NEXT: roll %cl, %edi
; X86-NEXT: shrl $3, %ecx
; X86-NEXT: andl $28, %ecx
; X86-NEXT: andl %edi, (%esi,%ecx)
; X86-NEXT: movl 8(%esi), %ebx
; X86-NEXT: movl (%esi), %edi
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 12(%esi), %ebp
; X86-NEXT: orl 28(%esi), %ebp
; X86-NEXT: orl 20(%esi), %ecx
; X86-NEXT: orl %ebp, %ecx
; X86-NEXT: orl 24(%esi), %ebx
; X86-NEXT: movl 16(%esi), %ebp
; X86-NEXT: orl %edi, %ebp
; X86-NEXT: orl %ebx, %ebp
; X86-NEXT: movl (%edx), %esi
; X86-NEXT: movl %edi, (%edx)
; X86-NEXT: movl (%eax), %eax
; X86-NEXT: orl %ecx, %ebp
; X86-NEXT: jne .LBB24_2
; X86-NEXT: # %bb.1:
; X86-NEXT: addl %esi, %eax
; X86-NEXT: .LBB24_2:
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: chain_reset_i256:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $ecx killed $ecx def $rcx
; SSE-NEXT: movl $-2, %eax
; SSE-NEXT: roll %cl, %eax
; SSE-NEXT: shrl $3, %ecx
; SSE-NEXT: andl $28, %ecx
; SSE-NEXT: andl %eax, (%rdi,%rcx)
; SSE-NEXT: movq (%rdi), %rcx
; SSE-NEXT: movq 8(%rdi), %r8
; SSE-NEXT: orq 24(%rdi), %r8
; SSE-NEXT: movq 16(%rdi), %rdi
; SSE-NEXT: orq %rcx, %rdi
; SSE-NEXT: movl (%rsi), %eax
; SSE-NEXT: movl %ecx, (%rsi)
; SSE-NEXT: movl (%rdx), %ecx
; SSE-NEXT: addl %ecx, %eax
; SSE-NEXT: orq %r8, %rdi
; SSE-NEXT: cmovnel %ecx, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: chain_reset_i256:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx
; AVX-NEXT: movl $-2, %eax
; AVX-NEXT: roll %cl, %eax
; AVX-NEXT: shrl $3, %ecx
; AVX-NEXT: andl $28, %ecx
; AVX-NEXT: andl %eax, (%rdi,%rcx)
; AVX-NEXT: vmovdqu (%rdi), %ymm0
; AVX-NEXT: movl (%rdi), %ecx
; AVX-NEXT: movl (%rsi), %eax
; AVX-NEXT: movl %ecx, (%rsi)
; AVX-NEXT: movl (%rdx), %ecx
; AVX-NEXT: addl %ecx, %eax
; AVX-NEXT: vptest %ymm0, %ymm0
; AVX-NEXT: cmovnel %ecx, %eax
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%rem = and i32 %position, 255
%ofs = zext nneg i32 %rem to i256
%bit = shl nuw i256 1, %ofs
%ld0 = load i256, ptr %p0
%msk = xor i256 %bit, -1
%res = and i256 %ld0, %msk
store i256 %res, ptr %p0
%cmp = icmp ne i256 %res, 0
%ld1 = load i32, ptr %p1
%trunc = trunc i256 %res to i32
store i32 %trunc, ptr %p1
%ld2 = load i32, ptr %p2
%add = add i32 %ld1, %ld2
%sel = select i1 %cmp, i32 %ld2, i32 %add
ret i32 %sel
}
; BTC/BT/BTS sequence on same i128
define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
; X86-LABEL: sequence_i128:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $144, %esp
; X86-NEXT: movb 20(%ebp), %ch
; X86-NEXT: movb 12(%ebp), %cl
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
; X86-NEXT: movl 56(%esp,%eax), %edx
; X86-NEXT: movl 60(%esp,%eax), %esi
; X86-NEXT: shldl %cl, %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 48(%esp,%eax), %edi
; X86-NEXT: movl 52(%esp,%eax), %ebx
; X86-NEXT: shldl %cl, %ebx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl %cl, %edi, %ebx
; X86-NEXT: shll %cl, %edi
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movb %ch, %al
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
; X86-NEXT: movl 84(%esp,%eax), %edx
; X86-NEXT: movl 88(%esp,%eax), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl 20(%ebp), %ecx
; X86-NEXT: shldl %cl, %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 80(%esp,%eax), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 92(%esp,%eax), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: shldl %cl, %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %eax
; X86-NEXT: shll %cl, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl %cl, %esi, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: xorl 8(%eax), %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: xorl 12(%eax), %esi
; X86-NEXT: xorl (%eax), %edi
; X86-NEXT: xorl 4(%eax), %ebx
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl 16(%ebp), %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: andb $96, %al
; X86-NEXT: shrb $3, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl 96(%esp,%eax), %eax
; X86-NEXT: movl 16(%ebp), %ecx
; X86-NEXT: btl %ecx, %eax
; X86-NEXT: setae %al
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl 8(%ebp), %ecx
; X86-NEXT: movl %edx, 8(%ecx)
; X86-NEXT: movl %esi, 12(%ecx)
; X86-NEXT: movl %edi, (%ecx)
; X86-NEXT: movl %ebx, 4(%ecx)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: sequence_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %ecx, %eax
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: movl $1, %r8d
; SSE-NEXT: xorl %esi, %esi
; SSE-NEXT: shldq %cl, %r8, %rsi
; SSE-NEXT: movl $1, %r9d
; SSE-NEXT: shlq %cl, %r9
; SSE-NEXT: xorl %r11d, %r11d
; SSE-NEXT: testb $64, %cl
; SSE-NEXT: cmovneq %r9, %rsi
; SSE-NEXT: cmovneq %r11, %r9
; SSE-NEXT: xorl %r10d, %r10d
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: shldq %cl, %r8, %r10
; SSE-NEXT: shlq %cl, %r8
; SSE-NEXT: testb $64, %al
; SSE-NEXT: cmovneq %r8, %r10
; SSE-NEXT: cmovneq %r11, %r8
; SSE-NEXT: xorq 8(%rdi), %rsi
; SSE-NEXT: xorq (%rdi), %r9
; SSE-NEXT: movl %edx, %ecx
; SSE-NEXT: andb $32, %cl
; SSE-NEXT: movq %r9, %rax
; SSE-NEXT: shrdq %cl, %rsi, %rax
; SSE-NEXT: movq %rsi, %r11
; SSE-NEXT: shrq %cl, %r11
; SSE-NEXT: testb $64, %dl
; SSE-NEXT: cmoveq %rax, %r11
; SSE-NEXT: btl %edx, %r11d
; SSE-NEXT: setae %al
; SSE-NEXT: orq %r10, %rsi
; SSE-NEXT: orq %r8, %r9
; SSE-NEXT: movq %r9, (%rdi)
; SSE-NEXT: movq %rsi, 8(%rdi)
; SSE-NEXT: retq
;
; AVX2-LABEL: sequence_i128:
; AVX2: # %bb.0:
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: movl %esi, %ecx
; AVX2-NEXT: xorl %r9d, %r9d
; AVX2-NEXT: movl $1, %r10d
; AVX2-NEXT: xorl %esi, %esi
; AVX2-NEXT: shldq %cl, %r10, %rsi
; AVX2-NEXT: shlxq %rcx, %r10, %r8
; AVX2-NEXT: testb $64, %cl
; AVX2-NEXT: cmovneq %r8, %rsi
; AVX2-NEXT: cmovneq %r9, %r8
; AVX2-NEXT: xorl %r11d, %r11d
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shldq %cl, %r10, %r11
; AVX2-NEXT: shlxq %rax, %r10, %r10
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: cmovneq %r10, %r11
; AVX2-NEXT: cmovneq %r9, %r10
; AVX2-NEXT: xorq 8(%rdi), %rsi
; AVX2-NEXT: xorq (%rdi), %r8
; AVX2-NEXT: movl %edx, %ecx
; AVX2-NEXT: andb $32, %cl
; AVX2-NEXT: movq %r8, %rax
; AVX2-NEXT: shrdq %cl, %rsi, %rax
; AVX2-NEXT: shrxq %rcx, %rsi, %rcx
; AVX2-NEXT: testb $64, %dl
; AVX2-NEXT: cmoveq %rax, %rcx
; AVX2-NEXT: btl %edx, %ecx
; AVX2-NEXT: setae %al
; AVX2-NEXT: orq %r11, %rsi
; AVX2-NEXT: orq %r10, %r8
; AVX2-NEXT: movq %r8, (%rdi)
; AVX2-NEXT: movq %rsi, 8(%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: sequence_i128:
; AVX512: # %bb.0:
; AVX512-NEXT: movl %ecx, %eax
; AVX512-NEXT: movl %esi, %ecx
; AVX512-NEXT: movl $1, %r9d
; AVX512-NEXT: xorl %esi, %esi
; AVX512-NEXT: shldq %cl, %r9, %rsi
; AVX512-NEXT: xorl %r10d, %r10d
; AVX512-NEXT: shlxq %rcx, %r9, %r8
; AVX512-NEXT: testb $64, %cl
; AVX512-NEXT: cmovneq %r8, %rsi
; AVX512-NEXT: cmovneq %r10, %r8
; AVX512-NEXT: xorl %r11d, %r11d
; AVX512-NEXT: movl %eax, %ecx
; AVX512-NEXT: shldq %cl, %r9, %r11
; AVX512-NEXT: shlxq %rax, %r9, %r9
; AVX512-NEXT: testb $64, %al
; AVX512-NEXT: cmovneq %r9, %r11
; AVX512-NEXT: cmovneq %r10, %r9
; AVX512-NEXT: xorq 8(%rdi), %rsi
; AVX512-NEXT: xorq (%rdi), %r8
; AVX512-NEXT: movl %edx, %ecx
; AVX512-NEXT: andb $32, %cl
; AVX512-NEXT: movq %r8, %rax
; AVX512-NEXT: shrdq %cl, %rsi, %rax
; AVX512-NEXT: shrxq %rcx, %rsi, %rcx
; AVX512-NEXT: testb $64, %dl
; AVX512-NEXT: cmoveq %rax, %rcx
; AVX512-NEXT: btl %edx, %ecx
; AVX512-NEXT: setae %al
; AVX512-NEXT: orq %r11, %rsi
; AVX512-NEXT: orq %r9, %r8
; AVX512-NEXT: movq %r8, (%rdi)
; AVX512-NEXT: movq %rsi, 8(%rdi)
; AVX512-NEXT: retq
%rem0 = and i32 %pos0, 127
%rem1 = and i32 %pos1, 127
%rem2 = and i32 %pos2, 127
%ofs0 = zext nneg i32 %rem0 to i128
%ofs1 = zext nneg i32 %rem1 to i128
%ofs2 = zext nneg i32 %rem2 to i128
%bit0 = shl nuw i128 1, %ofs0
%bit1 = shl nuw i128 1, %ofs1
%bit2 = shl nuw i128 1, %ofs2
%ld = load i128, ptr %word
%res0 = xor i128 %ld, %bit0
%test1 = and i128 %res0, %bit1
%cmp1 = icmp eq i128 %test1, 0
%res2 = or i128 %res0, %bit2
store i128 %res2, ptr %word
ret i1 %cmp1
}
define i32 @blsr_u512(ptr %word) nounwind {
; X86-LABEL: blsr_u512:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $240, %esp
; X86-NEXT: movl 8(%ebp), %ebx
; X86-NEXT: movl 12(%ebx), %esi
; X86-NEXT: movl 28(%ebx), %eax
; X86-NEXT: movl 60(%ebx), %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl 44(%ebx), %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl 20(%ebx), %edx
; X86-NEXT: movl 52(%ebx), %eax
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %eax, %edx
; X86-NEXT: movl 4(%ebx), %edi
; X86-NEXT: movl 36(%ebx), %esi
; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %eax
; X86-NEXT: orl %edx, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: movl 24(%ebx), %edx
; X86-NEXT: movl 56(%ebx), %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %ecx, %edx
; X86-NEXT: movl 8(%ebx), %ecx
; X86-NEXT: movl 40(%ebx), %esi
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %ecx
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: movl 16(%ebx), %edx
; X86-NEXT: movl 48(%ebx), %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %edx
; X86-NEXT: movl (%ebx), %esi
; X86-NEXT: movl 32(%ebx), %ebx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %ebx, %esi
; X86-NEXT: orl %edx, %esi
; X86-NEXT: orl %ecx, %esi
; X86-NEXT: orl %eax, %esi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: je .LBB26_1
; X86-NEXT: # %bb.2: # %cond.false
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: jne .LBB26_3
; X86-NEXT: # %bb.4: # %cond.false
; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: addl $32, %eax
; X86-NEXT: jmp .LBB26_5
; X86-NEXT: .LBB26_1:
; X86-NEXT: movl $512, %ecx # imm = 0x200
; X86-NEXT: jmp .LBB26_41
; X86-NEXT: .LBB26_3:
; X86-NEXT: rep bsfl %ebx, %eax
; X86-NEXT: .LBB26_5: # %cond.false
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: jne .LBB26_6
; X86-NEXT: # %bb.7: # %cond.false
; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: addl $32, %ecx
; X86-NEXT: jmp .LBB26_8
; X86-NEXT: .LBB26_6:
; X86-NEXT: rep bsfl %ecx, %ecx
; X86-NEXT: .LBB26_8: # %cond.false
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: jne .LBB26_10
; X86-NEXT: # %bb.9: # %cond.false
; X86-NEXT: addl $64, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: .LBB26_10: # %cond.false
; X86-NEXT: testl %esi, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: jne .LBB26_11
; X86-NEXT: # %bb.12: # %cond.false
; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: addl $32, %ecx
; X86-NEXT: testl %edx, %edx
; X86-NEXT: je .LBB26_15
; X86-NEXT: .LBB26_14:
; X86-NEXT: rep bsfl %edx, %edx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: je .LBB26_17
; X86-NEXT: jmp .LBB26_18
; X86-NEXT: .LBB26_11:
; X86-NEXT: rep bsfl %esi, %ecx
; X86-NEXT: testl %edx, %edx
; X86-NEXT: jne .LBB26_14
; X86-NEXT: .LBB26_15: # %cond.false
; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: addl $32, %edx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: jne .LBB26_18
; X86-NEXT: .LBB26_17: # %cond.false
; X86-NEXT: addl $64, %edx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: .LBB26_18: # %cond.false
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: orl %edx, %esi
; X86-NEXT: jne .LBB26_20
; X86-NEXT: # %bb.19: # %cond.false
; X86-NEXT: subl $-128, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: .LBB26_20: # %cond.false
; X86-NEXT: addl $256, %eax # imm = 0x100
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: testl %edx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: jne .LBB26_21
; X86-NEXT: # %bb.22: # %cond.false
; X86-NEXT: rep bsfl %edi, %ebx
; X86-NEXT: addl $32, %ebx
; X86-NEXT: jmp .LBB26_23
; X86-NEXT: .LBB26_21:
; X86-NEXT: rep bsfl %edx, %ebx
; X86-NEXT: .LBB26_23: # %cond.false
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: jne .LBB26_24
; X86-NEXT: # %bb.25: # %cond.false
; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: addl $32, %ecx
; X86-NEXT: orl %edi, %edx
; X86-NEXT: je .LBB26_27
; X86-NEXT: jmp .LBB26_28
; X86-NEXT: .LBB26_24:
; X86-NEXT: rep bsfl %ecx, %ecx
; X86-NEXT: orl %edi, %edx
; X86-NEXT: jne .LBB26_28
; X86-NEXT: .LBB26_27: # %cond.false
; X86-NEXT: addl $64, %ecx
; X86-NEXT: movl %ecx, %ebx
; X86-NEXT: .LBB26_28: # %cond.false
; X86-NEXT: testl %esi, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: jne .LBB26_29
; X86-NEXT: # %bb.30: # %cond.false
; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: addl $32, %ecx
; X86-NEXT: testl %edx, %edx
; X86-NEXT: je .LBB26_33
; X86-NEXT: .LBB26_32:
; X86-NEXT: rep bsfl %edx, %edx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: je .LBB26_35
; X86-NEXT: jmp .LBB26_36
; X86-NEXT: .LBB26_29:
; X86-NEXT: rep bsfl %esi, %ecx
; X86-NEXT: testl %edx, %edx
; X86-NEXT: jne .LBB26_32
; X86-NEXT: .LBB26_33: # %cond.false
; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: addl $32, %edx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: jne .LBB26_36
; X86-NEXT: .LBB26_35: # %cond.false
; X86-NEXT: addl $64, %edx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: .LBB26_36: # %cond.false
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edi, %esi
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: orl %edx, %esi
; X86-NEXT: jne .LBB26_38
; X86-NEXT: # %bb.37: # %cond.false
; X86-NEXT: subl $-128, %ecx
; X86-NEXT: movl %ecx, %ebx
; X86-NEXT: .LBB26_38: # %cond.false
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: orl %ecx, %edx
; X86-NEXT: movl %edi, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: orl %ecx, %esi
; X86-NEXT: orl %edx, %esi
; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: jne .LBB26_40
; X86-NEXT: # %bb.39: # %cond.false
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: .LBB26_40: # %cond.false
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .LBB26_41: # %cond.end
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: andl $60, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
; X86-NEXT: subl %esi, %edx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: andl $31, %ecx
; X86-NEXT: movl 56(%edx), %edi
; X86-NEXT: movl 60(%edx), %esi
; X86-NEXT: shldl %cl, %edi, %esi
; X86-NEXT: notl %esi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 52(%edx), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl %cl, %esi, %edi
; X86-NEXT: notl %edi
; X86-NEXT: andl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 40(%edx), %eax
; X86-NEXT: movl 44(%edx), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: notl %esi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 36(%edx), %esi
; X86-NEXT: shldl %cl, %esi, %eax
; X86-NEXT: notl %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 32(%edx), %eax
; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: notl %esi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 28(%edx), %esi
; X86-NEXT: shldl %cl, %esi, %eax
; X86-NEXT: notl %eax
; X86-NEXT: andl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 24(%edx), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: notl %esi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 4(%edx), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 8(%edx), %eax
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %edx
; X86-NEXT: shldl %cl, %esi, %edx
; X86-NEXT: notl %edx
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 12(%ebx), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: notl %esi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 16(%ebx), %eax
; X86-NEXT: movl %eax, %esi
; X86-NEXT: shldl %cl, %edx, %esi
; X86-NEXT: notl %esi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 20(%ebx), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: notl %esi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %edx, %eax
; X86-NEXT: notl %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: negl %eax
; X86-NEXT: movl 208(%esp,%eax), %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %edx, %eax
; X86-NEXT: notl %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %eax, %edx
; X86-NEXT: notl %edx
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl (%ebx), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shll %cl, %eax
; X86-NEXT: notl %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: shldl %cl, %edi, %ebx
; X86-NEXT: notl %ebx
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ebx, %edi
; X86-NEXT: movl 8(%ebp), %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl %ebx, 24(%ecx)
; X86-NEXT: movl %esi, 20(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, 16(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, 12(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, 8(%ecx)
; X86-NEXT: movl %edi, 4(%ecx)
; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, 28(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, 32(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, 36(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, 40(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, 44(%ecx)
; X86-NEXT: movl %edx, 48(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, 52(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, 56(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, 60(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: blsr_u512:
; SSE: # %bb.0:
; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: movq 48(%rdi), %r11
; SSE-NEXT: movq 40(%rdi), %r9
; SSE-NEXT: movq 24(%rdi), %r8
; SSE-NEXT: movq 16(%rdi), %rdx
; SSE-NEXT: movq (%rdi), %rcx
; SSE-NEXT: movq 8(%rdi), %rsi
; SSE-NEXT: rep bsfq %rcx, %rax
; SSE-NEXT: rep bsfq %rsi, %rbx
; SSE-NEXT: addq $64, %rbx
; SSE-NEXT: testq %rcx, %rcx
; SSE-NEXT: cmovneq %rax, %rbx
; SSE-NEXT: rep bsfq %rdx, %rax
; SSE-NEXT: rep bsfq %r8, %r10
; SSE-NEXT: addq $64, %r10
; SSE-NEXT: testq %rdx, %rdx
; SSE-NEXT: cmovneq %rax, %r10
; SSE-NEXT: movq 32(%rdi), %r14
; SSE-NEXT: subq $-128, %r10
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: orq %rsi, %rax
; SSE-NEXT: cmovneq %rbx, %r10
; SSE-NEXT: rep bsfq %r14, %rax
; SSE-NEXT: rep bsfq %r9, %rbx
; SSE-NEXT: addq $64, %rbx
; SSE-NEXT: testq %r14, %r14
; SSE-NEXT: cmovneq %rax, %rbx
; SSE-NEXT: rep bsfq %r11, %r15
; SSE-NEXT: movl $64, %eax
; SSE-NEXT: rep bsfq 56(%rdi), %rax
; SSE-NEXT: addq $64, %rax
; SSE-NEXT: testq %r11, %r11
; SSE-NEXT: cmovneq %r15, %rax
; SSE-NEXT: subq $-128, %rax
; SSE-NEXT: orq %r9, %r14
; SSE-NEXT: cmovneq %rbx, %rax
; SSE-NEXT: addq $256, %rax # imm = 0x100
; SSE-NEXT: orq %r8, %rsi
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: orq %rsi, %rcx
; SSE-NEXT: cmovneq %r10, %rax
; SSE-NEXT: movl $-2, %edx
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: roll %cl, %edx
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: shrl $3, %ecx
; SSE-NEXT: andl $60, %ecx
; SSE-NEXT: andl %edx, (%rdi,%rcx)
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %r15
; SSE-NEXT: retq
;
; AVX2-LABEL: blsr_u512:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: movq 40(%rdi), %r9
; AVX2-NEXT: movq 32(%rdi), %r10
; AVX2-NEXT: movq 24(%rdi), %r8
; AVX2-NEXT: movq 16(%rdi), %rdx
; AVX2-NEXT: movq (%rdi), %rcx
; AVX2-NEXT: movq 8(%rdi), %rsi
; AVX2-NEXT: tzcntq %rcx, %rax
; AVX2-NEXT: xorl %ebx, %ebx
; AVX2-NEXT: tzcntq %rsi, %rbx
; AVX2-NEXT: addq $64, %rbx
; AVX2-NEXT: testq %rcx, %rcx
; AVX2-NEXT: cmovneq %rax, %rbx
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq %rdx, %rax
; AVX2-NEXT: tzcntq %r8, %r11
; AVX2-NEXT: addq $64, %r11
; AVX2-NEXT: testq %rdx, %rdx
; AVX2-NEXT: cmovneq %rax, %r11
; AVX2-NEXT: subq $-128, %r11
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: orq %rsi, %rax
; AVX2-NEXT: cmovneq %rbx, %r11
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq %r10, %rax
; AVX2-NEXT: xorl %ebx, %ebx
; AVX2-NEXT: tzcntq %r9, %rbx
; AVX2-NEXT: addq $64, %rbx
; AVX2-NEXT: testq %r10, %r10
; AVX2-NEXT: cmovneq %rax, %rbx
; AVX2-NEXT: movq 48(%rdi), %r14
; AVX2-NEXT: xorl %r15d, %r15d
; AVX2-NEXT: tzcntq %r14, %r15
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq 56(%rdi), %rax
; AVX2-NEXT: addq $64, %rax
; AVX2-NEXT: testq %r14, %r14
; AVX2-NEXT: cmovneq %r15, %rax
; AVX2-NEXT: subq $-128, %rax
; AVX2-NEXT: orq %r9, %r10
; AVX2-NEXT: cmovneq %rbx, %rax
; AVX2-NEXT: addq $256, %rax # imm = 0x100
; AVX2-NEXT: orq %r8, %rsi
; AVX2-NEXT: orq %rdx, %rcx
; AVX2-NEXT: orq %rsi, %rcx
; AVX2-NEXT: cmovneq %r11, %rax
; AVX2-NEXT: movl $-2, %edx
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: roll %cl, %edx
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shrl $3, %ecx
; AVX2-NEXT: andl $60, %ecx
; AVX2-NEXT: andl %edx, (%rdi,%rcx)
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
; AVX512-LABEL: blsr_u512:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vpandnq %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vplzcntq %zmm1, %zmm1
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
; AVX512-NEXT: vpsubq %zmm1, %zmm2, %zmm1
; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: movl $-2, %edx
; AVX512-NEXT: movl %eax, %ecx
; AVX512-NEXT: roll %cl, %edx
; AVX512-NEXT: movl %eax, %ecx
; AVX512-NEXT: shrl $3, %ecx
; AVX512-NEXT: andl $60, %ecx
; AVX512-NEXT: andl %edx, (%rdi,%rcx)
; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%ld = load i512, ptr %word
%tz = tail call range(i512 0, 513) i512 @llvm.cttz.i512(i512 %ld, i1 false)
%tz.cast = trunc nuw nsw i512 %tz to i32
%tz.mask = and i512 %tz, 511
%mask = shl nuw i512 1, %tz.mask
%mask.not = xor i512 %mask, -1
%blsr = and i512 %ld, %mask.not
store i512 %blsr, ptr %word
ret i32 %tz.cast
}