[ExpandMemCmp] Allow overlapping loads in the zero-relational case.

Summary:
This allows doing `memcmp(p, q, 7)` with 2 loads instead of a call to
memcmp.
This fixes part of PR45147.

Reviewers: spatel

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D76133
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 3fc6154..cca9cdc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3639,6 +3639,8 @@
   TTI::MemCmpExpansionOptions Options;
   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
   Options.NumLoadsPerBlock = 2;
+  // All GPR and vector loads can be unaligned.
+  Options.AllowOverlappingLoads = true;
   if (IsZeroCmp) {
     // Only enable vector loads for equality comparison. Right now the vector
     // version is not as fast for three way compare (see #33329).
@@ -3646,8 +3648,6 @@
     if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
     if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
     if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
-    // All GPR and vector loads can be unaligned.
-    Options.AllowOverlappingLoads = true;
   }
   if (ST->is64Bit()) {
     Options.LoadSizes.push_back(8);
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
index 21aef9c..374f573 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -577,64 +577,53 @@
 ; X86-LABEL: length7:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %ecx
 ; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB19_4
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    jne .LBB19_2
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzwl 4(%eax), %edx
-; X86-NEXT:    movzwl 4(%ecx), %esi
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    movzwl %dx, %edx
-; X86-NEXT:    movzwl %si, %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB19_4
-; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movzbl 6(%eax), %eax
-; X86-NEXT:    movzbl 6(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB19_4: # %res_block
+; X86-NEXT:    movl 3(%esi), %ecx
+; X86-NEXT:    movl 3(%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    je .LBB19_3
+; X86-NEXT:  .LBB19_2: # %res_block
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
+; X86-NEXT:  .LBB19_3: # %endblock
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: length7:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl (%rsi), %ecx
-; X64-NEXT:    bswapl %eax
+; X64-NEXT:    movl (%rdi), %ecx
+; X64-NEXT:    movl (%rsi), %edx
 ; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    jne .LBB19_4
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    jne .LBB19_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzwl 4(%rdi), %eax
-; X64-NEXT:    movzwl 4(%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    jne .LBB19_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movzbl 6(%rdi), %eax
-; X64-NEXT:    movzbl 6(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB19_4: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    setae %dl
-; X64-NEXT:    leal -1(%rdx,%rdx), %eax
+; X64-NEXT:    movl 3(%rdi), %ecx
+; X64-NEXT:    movl 3(%rsi), %edx
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    je .LBB19_3
+; X64-NEXT:  .LBB19_2: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB19_3: # %endblock
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 7) nounwind
   ret i32 %m
@@ -671,31 +660,25 @@
 ; X86-LABEL: length7_lt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %ecx
 ; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB21_4
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    jne .LBB21_2
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzwl 4(%eax), %edx
-; X86-NEXT:    movzwl 4(%ecx), %esi
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    movzwl %dx, %edx
-; X86-NEXT:    movzwl %si, %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB21_4
-; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movzbl 6(%eax), %eax
-; X86-NEXT:    movzbl 6(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB21_3
-; X86-NEXT:  .LBB21_4: # %res_block
+; X86-NEXT:    movl 3(%esi), %ecx
+; X86-NEXT:    movl 3(%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    je .LBB21_3
+; X86-NEXT:  .LBB21_2: # %res_block
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB21_3: # %endblock
@@ -706,33 +689,26 @@
 ;
 ; X64-LABEL: length7_lt:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl (%rsi), %ecx
-; X64-NEXT:    bswapl %eax
+; X64-NEXT:    movl (%rdi), %ecx
+; X64-NEXT:    movl (%rsi), %edx
 ; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    jne .LBB21_3
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    jne .LBB21_2
 ; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzwl 4(%rdi), %eax
-; X64-NEXT:    movzwl 4(%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    jne .LBB21_3
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movzbl 6(%rdi), %eax
-; X64-NEXT:    movzbl 6(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB21_3: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    setae %dl
-; X64-NEXT:    leal -1(%rdx,%rdx), %eax
+; X64-NEXT:    movl 3(%rdi), %ecx
+; X64-NEXT:    movl 3(%rsi), %edx
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    je .LBB21_3
+; X64-NEXT:  .LBB21_2: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB21_3: # %endblock
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
@@ -1931,8 +1907,41 @@
 ;
 ; X64-LABEL: length31:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $31, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB43_4
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movq 8(%rdi), %rcx
+; X64-NEXT:    movq 8(%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB43_4
+; X64-NEXT:  # %bb.2: # %loadbb2
+; X64-NEXT:    movq 16(%rdi), %rcx
+; X64-NEXT:    movq 16(%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB43_4
+; X64-NEXT:  # %bb.3: # %loadbb3
+; X64-NEXT:    movq 23(%rdi), %rcx
+; X64-NEXT:    movq 23(%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    je .LBB43_5
+; X64-NEXT:  .LBB43_4: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB43_5: # %endblock
+; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 31) nounwind
   ret i32 %m
 }
@@ -2063,12 +2072,42 @@
 ;
 ; X64-LABEL: length31_lt:
 ; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $31, %edx
-; X64-NEXT:    callq memcmp
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB45_4
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movq 8(%rdi), %rcx
+; X64-NEXT:    movq 8(%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB45_4
+; X64-NEXT:  # %bb.2: # %loadbb2
+; X64-NEXT:    movq 16(%rdi), %rcx
+; X64-NEXT:    movq 16(%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB45_4
+; X64-NEXT:  # %bb.3: # %loadbb3
+; X64-NEXT:    movq 23(%rdi), %rcx
+; X64-NEXT:    movq 23(%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    je .LBB45_5
+; X64-NEXT:  .LBB45_4: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB45_5: # %endblock
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 31) nounwind
   %cmp = icmp slt i32 %call, 0
@@ -2090,12 +2129,42 @@
 ;
 ; X64-LABEL: length31_gt:
 ; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $31, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq (%rsi), %rcx
+; X64-NEXT:    bswapq %rax
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    jne .LBB46_4
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movq 8(%rdi), %rax
+; X64-NEXT:    movq 8(%rsi), %rcx
+; X64-NEXT:    bswapq %rax
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    jne .LBB46_4
+; X64-NEXT:  # %bb.2: # %loadbb2
+; X64-NEXT:    movq 16(%rdi), %rax
+; X64-NEXT:    movq 16(%rsi), %rcx
+; X64-NEXT:    bswapq %rax
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    jne .LBB46_4
+; X64-NEXT:  # %bb.3: # %loadbb3
+; X64-NEXT:    movq 23(%rdi), %rax
+; X64-NEXT:    movq 23(%rsi), %rcx
+; X64-NEXT:    bswapq %rax
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    je .LBB46_5
+; X64-NEXT:  .LBB46_4: # %res_block
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    setae %dl
+; X64-NEXT:    leal -1(%rdx,%rdx), %edx
+; X64-NEXT:  .LBB46_5: # %endblock
+; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 31) nounwind
   %cmp = icmp sgt i32 %call, 0
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
index 64c8b8c..de604de 100644
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -622,18 +622,55 @@
 define i32 @length7(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length7:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $7
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %ecx
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    jne .LBB21_2
+; X86-NEXT:  # %bb.1: # %loadbb1
+; X86-NEXT:    movl 3(%esi), %ecx
+; X86-NEXT:    movl 3(%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    je .LBB21_3
+; X86-NEXT:  .LBB21_2: # %res_block
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    setae %al
+; X86-NEXT:    leal -1(%eax,%eax), %eax
+; X86-NEXT:  .LBB21_3: # %endblock
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: length7:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $7, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
+; X64-NEXT:    movl (%rdi), %ecx
+; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    jne .LBB21_2
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movl 3(%rdi), %ecx
+; X64-NEXT:    movl 3(%rsi), %edx
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    je .LBB21_3
+; X64-NEXT:  .LBB21_2: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB21_3: # %endblock
+; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 7) nounwind
   ret i32 %m
 }
@@ -641,24 +678,58 @@
 define i1 @length7_lt(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length7_lt:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $7
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %ecx
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    jne .LBB22_2
+; X86-NEXT:  # %bb.1: # %loadbb1
+; X86-NEXT:    movl 3(%esi), %ecx
+; X86-NEXT:    movl 3(%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    je .LBB22_3
+; X86-NEXT:  .LBB22_2: # %res_block
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    setae %al
+; X86-NEXT:    leal -1(%eax,%eax), %eax
+; X86-NEXT:  .LBB22_3: # %endblock
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: length7_lt:
 ; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $7, %edx
-; X64-NEXT:    callq memcmp
+; X64-NEXT:    movl (%rdi), %ecx
+; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    jne .LBB22_2
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movl 3(%rdi), %ecx
+; X64-NEXT:    movl 3(%rsi), %edx
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    je .LBB22_3
+; X64-NEXT:  .LBB22_2: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB22_3: # %endblock
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 7) nounwind
   %c = icmp slt i32 %m, 0
@@ -998,8 +1069,27 @@
 ;
 ; X64-LABEL: length15:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $15, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB34_2
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movq 7(%rdi), %rcx
+; X64-NEXT:    movq 7(%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    je .LBB34_3
+; X64-NEXT:  .LBB34_2: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB34_3: # %endblock
+; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 15) nounwind
   ret i32 %m
 }
@@ -1019,12 +1109,28 @@
 ;
 ; X64-LABEL: length15_lt:
 ; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $15, %edx
-; X64-NEXT:    callq memcmp
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB35_2
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movq 7(%rdi), %rcx
+; X64-NEXT:    movq 7(%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    je .LBB35_3
+; X64-NEXT:  .LBB35_2: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB35_3: # %endblock
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 15) nounwind
   %c = icmp slt i32 %m, 0
@@ -1044,9 +1150,25 @@
 ;
 ; X64-LABEL: length15_const:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $.L.str+1, %esi
-; X64-NEXT:    movl $15, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
+; X64-NEXT:    movabsq $3544952156018063160, %rcx # imm = 0x3132333435363738
+; X64-NEXT:    movq (%rdi), %rdx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rcx, %rdx
+; X64-NEXT:    jne .LBB36_2
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movabsq $4051322327650219061, %rcx # imm = 0x3839303132333435
+; X64-NEXT:    movq 7(%rdi), %rdx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rcx, %rdx
+; X64-NEXT:    je .LBB36_3
+; X64-NEXT:  .LBB36_2: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rcx, %rdx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB36_3: # %endblock
+; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i64 15) nounwind
   ret i32 %m
 }
@@ -1093,13 +1215,26 @@
 ;
 ; X64-LABEL: length15_gt_const:
 ; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str+1, %esi
-; X64-NEXT:    movl $15, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    movabsq $3544952156018063160, %rax # imm = 0x3132333435363738
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    cmpq %rax, %rcx
+; X64-NEXT:    jne .LBB38_2
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movabsq $4051322327650219061, %rax # imm = 0x3839303132333435
+; X64-NEXT:    movq 7(%rdi), %rcx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    cmpq %rax, %rcx
+; X64-NEXT:    je .LBB38_3
+; X64-NEXT:  .LBB38_2: # %res_block
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    cmpq %rax, %rcx
+; X64-NEXT:    setae %dl
+; X64-NEXT:    leal -1(%rdx,%rdx), %edx
+; X64-NEXT:  .LBB38_3: # %endblock
+; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i64 15) nounwind
   %c = icmp sgt i32 %m, 0
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index b8cfe04..df05021 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -165,8 +165,36 @@
 
 define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; ALL-LABEL: @cmp7(
-; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7)
-; ALL-NEXT:    ret i32 [[CALL]]
+; ALL-NEXT:    br label [[LOADBB:%.*]]
+; ALL:       res_block:
+; ALL-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
+; ALL-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; ALL-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; ALL-NEXT:    br label [[ENDBLOCK:%.*]]
+; ALL:       loadbb:
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]]
+; ALL-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP4]]
+; ALL-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
+; ALL-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
+; ALL-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; ALL:       loadbb1:
+; ALL-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 3
+; ALL-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 3
+; ALL-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+; ALL-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i32*
+; ALL-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP12]]
+; ALL-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP13]]
+; ALL-NEXT:    [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
+; ALL-NEXT:    [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
+; ALL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
+; ALL-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; ALL:       endblock:
+; ALL-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; ALL-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
   ret i32 %call
@@ -304,9 +332,41 @@
 }
 
 define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
-; ALL-LABEL: @cmp11(
-; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11)
-; ALL-NEXT:    ret i32 [[CALL]]
+; X32-LABEL: @cmp11(
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11)
+; X32-NEXT:    ret i32 [[CALL]]
+;
+; X64-LABEL: @cmp11(
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 3
+; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 3
+; X64-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i64*
+; X64-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i64*
+; X64-NEXT:    [[TMP14:%.*]] = load i64, i64* [[TMP12]]
+; X64-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP13]]
+; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
+; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
+; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
+; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
   ret i32 %call
@@ -356,27 +416,123 @@
 }
 
 define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
-; ALL-LABEL: @cmp13(
-; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13)
-; ALL-NEXT:    ret i32 [[CALL]]
+; X32-LABEL: @cmp13(
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13)
+; X32-NEXT:    ret i32 [[CALL]]
+;
+; X64-LABEL: @cmp13(
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 5
+; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 5
+; X64-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i64*
+; X64-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i64*
+; X64-NEXT:    [[TMP14:%.*]] = load i64, i64* [[TMP12]]
+; X64-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP13]]
+; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
+; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
+; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
+; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
   ret i32 %call
 }
 
 define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
-; ALL-LABEL: @cmp14(
-; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14)
-; ALL-NEXT:    ret i32 [[CALL]]
+; X32-LABEL: @cmp14(
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14)
+; X32-NEXT:    ret i32 [[CALL]]
+;
+; X64-LABEL: @cmp14(
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 6
+; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 6
+; X64-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i64*
+; X64-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i64*
+; X64-NEXT:    [[TMP14:%.*]] = load i64, i64* [[TMP12]]
+; X64-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP13]]
+; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
+; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
+; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
+; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
   ret i32 %call
 }
 
 define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
-; ALL-LABEL: @cmp15(
-; ALL-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15)
-; ALL-NEXT:    ret i32 [[CALL]]
+; X32-LABEL: @cmp15(
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15)
+; X32-NEXT:    ret i32 [[CALL]]
+;
+; X64-LABEL: @cmp15(
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 7
+; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 7
+; X64-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i64*
+; X64-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i64*
+; X64-NEXT:    [[TMP14:%.*]] = load i64, i64* [[TMP12]]
+; X64-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP13]]
+; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
+; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
+; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
+; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
   ret i32 %call